rdig 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0
2
+ initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2006 Jens Kraemer
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,61 @@
1
+ = RDig
2
+
3
+ RDig provides an HTTP crawler and content extraction utilities
4
+ to help build a site search for web sites or intranets. Internally,
5
+ Ferret is used for the full text indexing. After creating a config file
6
+ for your site, the index can be built with a single call to rdig.
7
+
8
+ RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
9
+
10
+ == basic usage
11
+
12
+
13
+ === Index creation
14
+ - create a config file based on the template in doc/examples
15
+ - to create an index:
16
+ rdig -c CONFIGFILE
17
+ - to run a query against the index (just to try it out)
18
+ rdig -c CONFIGFILE -q 'your query'
19
+ this will dump the first 10 search results to STDOUT
20
+
21
+ === Handle search in your application:
22
+ require 'rdig'
23
+ require 'rdig_config' # load your config file here
24
+ search_results = RDig.searcher.search(query, options={})
25
+
26
+ see RDig::Search::Searcher for more information.
27
+
28
+
29
+ == usage in rails
30
+
31
+ - add to config/environment.rb :
32
+ require 'rdig'
33
+ require 'rdig_config'
34
+ - place rdig_config.rb into config/ directory.
35
+ - build index:
36
+ rdig -c config/rdig_config.rb
37
+ - in your controller that handles the search form:
38
+ search_results = RDig.searcher.search(params[:query])
39
+ @results = search_results[:list]
40
+ @hitcount = search_results[:hitcount]
41
+
42
+ === search result paging
43
+ Use the :first_doc and :num_docs options to implement
44
+ paging through search results.
45
+ (:num_docs is 10 by default, so without using these options only the first 10
46
+ results will be retrieved)
47
+
48
+
49
+ == sample configuration
50
+
51
+ from doc/examples/config.rb. The tag_selector properties are called
52
+ with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
53
+ You can also have a look at the +html_content_extractor+ unit test.
54
+
55
+ See http://www.crummy.com/software/RubyfulSoup/documentation.html for API documentation of the
56
+ Rubyful Soup lib used
57
+
58
+ :include:doc/examples/config.rb
59
+
60
+
61
+
data/TODO ADDED
File without changes
data/bin/rdig ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # run with
4
+ # rdig -c CONFIGFILE
5
+ # where CONFIGFILE is the path to your RDig config file
6
+
7
# Load RDig straight from the load path when possible; only pull in
# RubyGems (and retry) when the plain require cannot find the library.
begin
  require 'rdig'
rescue LoadError
  require 'rubygems'
  require 'rdig'
end

# Hand control over to the RDig command line application.
RDig.application.run
14
+
15
+
16
+ #$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
17
+ #$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
18
+ #require 'init'
19
+
20
+ #if ARGV[0]
21
+ # require ARGV[0]
22
+ #else
23
+ # require 'config'
24
+ #end
25
+
26
+ #include SiteSearch
27
+
28
+
29
+ #puts "creating new index in #{SiteSearch.settings[:index_dir]}"
30
+
31
+ #crawler = Crawler.new
32
+ #crawler.run
@@ -0,0 +1,53 @@
1
RDig.configuration do |cfg|

  ##################################################################
  # Settings you should really provide

  # One or more URLs the crawler starts from.
  cfg.crawler.start_urls = ['http://www.example.com/']

  # Hosts the crawl is restricted to. Links pointing to any host not
  # listed here are never followed.
  cfg.crawler.include_hosts = ['www.example.com']

  # Path where the index will be stored.
  # Caution: existing contents of this directory will be deleted!
  cfg.ferret.path = '/path/to/index'

  ##################################################################
  # Settings you might want to change; the values shown are the defaults

  # --- content extraction ---

  # A callable that selects the tag holding the title of a document.
  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }

  # A callable that selects the tag holding the page content you want to
  # index. Useful to keep common elements such as navigation and page
  # footers from being indexed for every page.
  # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }

  # --- crawler ---

  # Either nil (index all documents) or an array of Regexps matching the
  # URLs you want to index.
  # cfg.crawler.include_documents = nil

  # Either nil (no documents excluded) or an array of Regexps matching
  # URLs not to index. This filter runs after the inclusion filter above,
  # so it only needs to list documents that are unwanted but would be
  # accepted by the inclusion patterns.
  # cfg.crawler.exclude_documents = nil

  # Number of http fetching threads to use.
  # cfg.crawler.num_threads = 2

  # Maximum number of http redirections to follow.
  # cfg.crawler.max_redirects = 5

  # Number of seconds to wait with an empty url queue before finishing
  # the crawl. Set to a higher number for slow sites.
  # cfg.crawler.wait_before_leave = 10

end
data/install.rb ADDED
@@ -0,0 +1,89 @@
1
require 'rbconfig'
require 'find'
require 'fileutils'
require 'tempfile'

# Name of the ruby executable; used to build the shebang line of the
# installed command.
$ruby = RbConfig::CONFIG['ruby_install_name']

##
# Install a binary file. We patch it on the way through to
# insert a #! line. If this is a Unix install, we name
# the command (for example) 'rdig' and let the shebang line
# handle running it. Under windows, we add a '.rb' extension
# and let file associations do their stuff.
#
# based on install.rb from the Rake distribution
#
# from::   path of the script to install
# opfile:: name of the installed command inside $bindir
#
# Reads the globals $realbindir, $ruby and $bindir, which are set further
# down in this script before the method is called.
def installBIN(from, opfile)
  # A Tempfile gets a unique name in the system temp directory, so the
  # staging copy cannot clobber (or be redirected through) a pre-existing
  # file the way a fixed "_tmp" name could.
  staging = Tempfile.new('rdig')
  begin
    interpreter = File.join($realbindir, $ruby)
    staging.puts "#!#{interpreter} -w"
    staging.write File.read(from)
    staging.close

    opfile += ".rb" if RbConfig::CONFIG["target_os"] =~ /mswin/i
    FileUtils.install(staging.path, File.join($bindir, opfile),
                      :mode => 0755, :verbose => true)
  ensure
    # Always remove the staging copy, even when the install step failed.
    staging.close unless staging.closed?
    staging.unlink
  end
end

# Work out where library files go: use sitelibdir when rbconfig knows it,
# otherwise derive a site_ruby directory from the load path / libdir.
$sitedir = RbConfig::CONFIG["sitelibdir"]
unless $sitedir
  version = RbConfig::CONFIG["MAJOR"] + "." + RbConfig::CONFIG["MINOR"]
  $libdir = File.join(RbConfig::CONFIG["libdir"], "ruby", version)
  $sitedir = $:.find { |x| x =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, "site_ruby")
  elsif !$sitedir.include?(version)
    # Plain substring test: the version is literal text, not a pattern.
    $sitedir = File.join($sitedir, version)
  end
end

$bindir = RbConfig::CONFIG["bindir"]

# The shebang line must point at the real interpreter location, even when
# the files themselves are staged below DESTDIR.
$realbindir = $bindir

if (destdir = ENV['DESTDIR'])
  $bindir = destdir + $bindir
  $sitedir = destdir + $sitedir

  FileUtils.mkdir_p($bindir)
  FileUtils.mkdir_p($sitedir)
end

rdig_dest = File.join($sitedir, "rdig")
FileUtils.mkdir_p(rdig_dest, :verbose => true)
File.chmod(0755, rdig_dest)

# The library files

files = Dir.chdir('lib') { Dir['**/*.rb'] }

files.each do |fn|
  # mkdir_p is a no-op for directories that already exist.
  FileUtils.mkdir_p(File.join($sitedir, File.dirname(fn)))
  FileUtils.install(File.join('lib', fn), File.join($sitedir, fn),
                    :mode => 0644, :verbose => true)
end

# and the executable

installBIN("bin/rdig", "rdig")
@@ -0,0 +1,21 @@
1
+ == 2.2 (2005-11-07)
2
+ * Important bug fixes -- thanks to Moonwolf
3
+ * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
4
+ * Decimal decoding edge cases addressed.
5
+ * Test cases added.
6
+
7
+ == 2.1 (2005-10-31)
8
+ * Removed some unnecessary code in basic entity encoding.
9
+ * Improved handling of encoding: commands are now automatically sorted, so the
10
+ user doesn't have to worry about their order.
11
+ * Now using setup.rb.
12
+ * Tests moved to separate file.
13
+
14
+ == 2.0 (2005-08-23)
15
+ * Added encoding to entities.
16
+ * Decoding interface unchanged.
17
+ * Fixed a bug with handling high codepoints.
18
+
19
+ == 1.0 (2005-08-03)
20
+ * Initial release.
21
+ * Decoding only.
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2005 Paul Battley
2
+
3
+ Usage of the works is permitted provided that this instrument is retained
4
+ with the works, so that any entity that uses the works is notified of this
5
+ instrument.
6
+
7
+ DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
@@ -0,0 +1,15 @@
1
+ HTML entity encoding and decoding for Ruby
2
+
3
+ This library extends the String class to allow encoding and decoding of
4
+ HTML/XML entities from/to their corresponding UTF-8 codepoints.
5
+
6
+ To install (requires root/admin privileges):
7
+
8
+ # ruby setup.rb
9
+
10
+ To test:
11
+
12
+ $ ruby setup.rb test
13
+
14
+ Comments are welcome. Send an email to pbattley @ gmail.com.
15
+
@@ -0,0 +1,281 @@
1
+ #
2
+ # HTML entity encoding and decoding for Ruby
3
+ #
4
+ # Author:: Paul BATTLEY (pbattley @ gmail.com)
5
+ # Version:: 2.2
6
+ # Date:: 2005-11-07
7
+ #
8
+ # == About
9
+ #
10
+ # This library extends the String class to allow encoding and decoding of
11
+ # HTML/XML entities from/to their corresponding UTF-8 codepoints.
12
+ #
13
+ # == Licence
14
+ #
15
+ # Copyright (c) 2005 Paul Battley
16
+ #
17
+ # Usage of the works is permitted provided that this instrument is retained
18
+ # with the works, so that any entity that uses the works is notified of this
19
+ # instrument.
20
+ #
21
+ # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
22
+ #
23
+
24
#
# Entity tables and precompiled patterns shared by String#decode_entities
# and String#encode_entities.
#
module HTMLEntities

  VERSION = '2.2'

  #
  # MAP is a hash of all the HTML entities I could discover, as taken
  # from the w3schools page on the subject:
  # http://www.w3schools.com/html/html_entitiesref.asp
  # The format is 'entity name' => codepoint where entity name is given
  # without the surrounding ampersand and semicolon.
  #
  MAP = {
    'quot'   => 34,
    'apos'   => 39,
    'amp'    => 38,
    'lt'     => 60,
    'gt'     => 62,
    'nbsp'   => 160,
    'iexcl'  => 161,
    'curren' => 164,
    'cent'   => 162,
    'pound'  => 163,
    'yen'    => 165,
    'brvbar' => 166,
    'sect'   => 167,
    'uml'    => 168,
    'copy'   => 169,
    'ordf'   => 170,
    'laquo'  => 171,
    'not'    => 172,
    'shy'    => 173,
    'reg'    => 174,
    'trade'  => 8482,
    'macr'   => 175,
    'deg'    => 176,
    'plusmn' => 177,
    'sup2'   => 178,
    'sup3'   => 179,
    'acute'  => 180,
    'micro'  => 181,
    'para'   => 182,
    'middot' => 183,
    'cedil'  => 184,
    'sup1'   => 185,
    'ordm'   => 186,
    'raquo'  => 187,
    'frac14' => 188,
    'frac12' => 189,
    'frac34' => 190,
    'iquest' => 191,
    'times'  => 215,
    'divide' => 247,
    'Agrave' => 192,
    'Aacute' => 193,
    'Acirc'  => 194,
    'Atilde' => 195,
    'Auml'   => 196,
    'Aring'  => 197,
    'AElig'  => 198,
    'Ccedil' => 199,
    'Egrave' => 200,
    'Eacute' => 201,
    'Ecirc'  => 202,
    'Euml'   => 203,
    'Igrave' => 204,
    'Iacute' => 205,
    'Icirc'  => 206,
    'Iuml'   => 207,
    'ETH'    => 208,
    'Ntilde' => 209,
    'Ograve' => 210,
    'Oacute' => 211,
    'Ocirc'  => 212,
    'Otilde' => 213,
    'Ouml'   => 214,
    'Oslash' => 216,
    'Ugrave' => 217,
    'Uacute' => 218,
    'Ucirc'  => 219,
    'Uuml'   => 220,
    'Yacute' => 221,
    'THORN'  => 222,
    'szlig'  => 223,
    'agrave' => 224,
    'aacute' => 225,
    'acirc'  => 226,
    'atilde' => 227,
    'auml'   => 228,
    'aring'  => 229,
    'aelig'  => 230,
    'ccedil' => 231,
    'egrave' => 232,
    'eacute' => 233,
    'ecirc'  => 234,
    'euml'   => 235,
    'igrave' => 236,
    'iacute' => 237,
    'icirc'  => 238,
    'iuml'   => 239,
    'eth'    => 240,
    'ntilde' => 241,
    'ograve' => 242,
    'oacute' => 243,
    'ocirc'  => 244,
    'otilde' => 245,
    'ouml'   => 246,
    'oslash' => 248,
    'ugrave' => 249,
    'uacute' => 250,
    'ucirc'  => 251,
    'uuml'   => 252,
    'yacute' => 253,
    'thorn'  => 254,
    'yuml'   => 255,
    'OElig'  => 338,
    'oelig'  => 339,
    'Scaron' => 352,
    'scaron' => 353,
    'Yuml'   => 376,
    'circ'   => 710,
    'tilde'  => 732,
    'ensp'   => 8194,
    'emsp'   => 8195,
    'thinsp' => 8201,
    'zwnj'   => 8204,
    'zwj'    => 8205,
    'lrm'    => 8206,
    'rlm'    => 8207,
    'ndash'  => 8211,
    'mdash'  => 8212,
    'lsquo'  => 8216,
    'rsquo'  => 8217,
    'sbquo'  => 8218,
    'ldquo'  => 8220,
    'rdquo'  => 8221,
    'bdquo'  => 8222,
    'dagger' => 8224,
    'Dagger' => 8225,
    'hellip' => 8230,
    'permil' => 8240,
    'lsaquo' => 8249,
    'rsaquo' => 8250,
    'euro'   => 8364
  }

  # Shortest and longest entity name in MAP; they bound the name pattern.
  MIN_LENGTH = MAP.keys.map { |name| name.length }.min
  MAX_LENGTH = MAP.keys.map { |name| name.length }.max

  # Highest codepoint Unicode defines; larger numeric references are not
  # valid characters.
  MAX_CODEPOINT = 0x10FFFF

  # Precompiled pattern for a named entity. Names start with a letter and
  # may continue with letters or digits (frac12, sup2, ...).
  NAMED_ENTITY_REGEXP =
    /&([a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}});/i

  # Precompiled pattern matching any entity in a single pass:
  # group 1 = entity name, group 2 = decimal digits, group 3 = hex digits.
  ENTITY_REGEXP =
    /&(?:([a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i

  # Reverse map for converting characters to named entities
  REVERSE_MAP = MAP.invert

  BASIC_ENTITY_REGEXP = /[<>'"&]/

  # Matches one control character (0x00-0x1f) or one non-ASCII character.
  # Written in terms of characters rather than raw UTF-8 bytes; the /u
  # flag makes the pattern UTF-8 aware.
  UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[^\x00-\x7f]/u

end

class String

  # Because there's no need to make the user worry about the order here,
  # let's handle it.
  ENCODE_ENTITIES_COMMAND_ORDER = {
    :basic => 0,
    :named => 1,
    :decimal => 2,
    :hexadecimal => 3
  }

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents. Obviously, if your string is not already in UTF-8, you'd
  # better convert it before using this method, or the output will be mixed
  # up.
  #
  # Unknown named entities and numeric references above U+10FFFF are left
  # untouched. All entities are decoded in a single pass, so the result of
  # one replacement is never decoded a second time ("&amp;#38;" becomes
  # "&#38;", not "&").
  #
  # Returns a new String; the receiver is not modified.
  #
  def decode_entities
    gsub(HTMLEntities::ENTITY_REGEXP) do
      entity, name, decimal, hex = $&, $1, $2, $3
      codepoint =
        if name
          HTMLEntities::MAP[name]
        elsif decimal
          decimal.to_i
        else
          hex.to_i(16)
        end
      if codepoint && codepoint <= HTMLEntities::MAX_CODEPOINT
        [codepoint].pack('U')
      else
        entity
      end
    end
  end

  #
  # Encode codepoints into their corresponding entities. Various operations
  # are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
  #
  # Besides non-ASCII characters, :named, :decimal and :hexadecimal also
  # consider the control characters 0x00-0x1f.
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # Examples:
  #   str.encode_entities - XML-safe
  #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
  #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
  #   non-ASCII characters replaced with their named entity where possible, and
  #   decimal equivalents otherwise.
  #
  # Raises RuntimeError when an unknown instruction is given.
  #
  # Note: It is the program's responsibility to ensure that the string
  # contains valid UTF-8 before calling this method.
  #
  # Returns a new String; the receiver is not modified.
  #
  def encode_entities(*instructions)
    if instructions.empty?
      instructions = [:basic]
    else
      instructions.each do |instr|
        unless ENCODE_ENTITIES_COMMAND_ORDER.key?(instr)
          raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
        end
      end
      instructions = instructions.sort_by { |instr| ENCODE_ENTITIES_COMMAND_ORDER[instr] }
    end

    instructions.inject(self) do |str, instruction|
      case instruction
      when :basic
        # unpack('U') gives the integer codepoint that REVERSE_MAP is keyed
        # on, independent of what String#[] returns for a single index.
        str.gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
          "&#{HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]]};"
        }
      when :named
        # Test everything except printable ASCII; characters without a
        # named entity are left for a later numeric pass (if requested).
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          name = HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]]
          name ? "&#{name};" : $&
        }
      when :decimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&##{$&.unpack('U')[0]};"
        }
      when :hexadecimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&#x#{$&.unpack('U')[0].to_s(16)};"
        }
      end
    end
  end

end