rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES ADDED
@@ -0,0 +1,2 @@
1
+ 0.1.0
2
+ initial release
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2006 Jens Kraemer
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,61 @@
1
+ = RDig
2
+
3
+ RDig provides an HTTP crawler and content extraction utilities
4
+ to help build a site search for web sites or intranets. Internally,
5
+ Ferret is used for the full text indexing. After creating a config file
6
+ for your site, the index can be built with a single call to rdig.
7
+
8
+ RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
9
+
10
+ == basic usage
11
+
12
+
13
+ === Index creation
14
+ - create a config file based on the template in doc/examples
15
+ - to create an index:
16
+ rdig -c CONFIGFILE
17
+ - to run a query against the index (just to try it out)
18
+ rdig -c CONFIGFILE -q 'your query'
19
+ this will dump the first 10 search results to STDOUT
20
+
21
+ === Handle search in your application:
22
+ require 'rdig'
23
+ require 'rdig_config' # load your config file here
24
+ search_results = RDig.searcher.search(query, options={})
25
+
26
+ see RDig::Search::Searcher for more information.
27
+
28
+
29
+ == usage in rails
30
+
31
+ - add to config/environment.rb :
32
+ require 'rdig'
33
+ require 'rdig_config'
34
+ - place rdig_config.rb into config/ directory.
35
+ - build index:
36
+ rdig -c config/rdig_config.rb
37
+ - in your controller that handles the search form:
38
+ search_results = RDig.searcher.search(params[:query])
39
+ @results = search_results[:list]
40
+ @hitcount = search_results[:hitcount]
41
+
42
+ === search result paging
43
+ Use the :first_doc and :num_docs options to implement
44
+ paging through search results.
45
+ (:num_docs is 10 by default, so without using these options only the first 10
46
+ results will be retrieved)
47
+
48
+
49
+ == sample configuration
50
+
51
+ from doc/examples/config.rb. The tag_selector properties are called
52
+ with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
53
+ You can also have a look at the +html_content_extractor+ unit test.
54
+
55
+ See the RubyfulSoup documentation[http://www.crummy.com/software/RubyfulSoup/documentation.html] for API documentation of the
56
+ Rubyful Soup lib used
57
+
58
+ :include:doc/examples/config.rb
59
+
60
+
61
+
data/TODO ADDED
File without changes
data/bin/rdig ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Command line entry point for RDig. Usage:
4
+ # rdig -c CONFIGFILE [-q 'your query']
5
+ # where CONFIGFILE is the path to your config file
6
+
7
+ begin
8
+ require 'rdig'
9
+ rescue LoadError # rdig is not on the load path: load RubyGems and try again
10
+ require 'rubygems'
11
+ require 'rdig'
12
+ end
13
+ RDig.application.run
14
+
15
+
16
+ #$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
17
+ #$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
18
+ #require 'init'
19
+
20
+ #if ARGV[0]
21
+ # require ARGV[0]
22
+ #else
23
+ # require 'config'
24
+ #end
25
+
26
+ #include SiteSearch
27
+
28
+
29
+ #puts "creating new index in #{SiteSearch.settings[:index_dir]}"
30
+
31
+ #crawler = Crawler.new
32
+ #crawler.run
@@ -0,0 +1,53 @@
1
+ RDig.configuration do |cfg|
2
+
3
+ ##################################################################
4
+ # options you should really set
5
+
6
+ # provide one or more URLs for the crawler to start from
7
+ cfg.crawler.start_urls = [ 'http://www.example.com/' ]
8
+
9
+ # limit the crawl to these hosts. The crawler will never
10
+ # follow any links pointing to hosts other than those given here.
11
+ cfg.crawler.include_hosts = [ 'www.example.com' ]
12
+
13
+ # this is the path where the index will be stored
14
+ # caution, existing contents of this directory will be deleted!
15
+ cfg.ferret.path = '/path/to/index'
16
+
17
+ ##################################################################
18
+ # options you might want to set, the given values are the defaults
19
+
20
+ # content extraction options
21
+
22
+ # provide a method that selects the tag containing the title of a document
23
+ # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
24
+
25
+ # provide a method that selects the tag containing the page content you
26
+ # want to index. Useful to avoid indexing common elements like navigation
27
+ # and page footers for every page.
28
+ # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
29
+
30
+ # crawler options
31
+
32
+ # nil (index all documents) or an array of Regexps
33
+ # matching URLs you want to index.
34
+ # cfg.crawler.include_documents = nil
35
+
36
+ # nil (no documents excluded) or an array of Regexps
37
+ # matching URLs not to index.
38
+ # this filter is used after the one above, so you only need
39
+ # to exclude documents here that aren't wanted but would be
40
+ # included by the inclusion patterns.
41
+ # cfg.crawler.exclude_documents = nil
42
+
43
+ # number of http fetching threads to use
44
+ # cfg.crawler.num_threads = 2
45
+
46
+ # maximum number of http redirections to follow
47
+ # cfg.crawler.max_redirects = 5
48
+
49
+ # number of seconds to wait with an empty url queue before
50
+ # finishing the crawl. Set to a higher number for slow sites
51
+ # cfg.crawler.wait_before_leave = 10
52
+
53
+ end
data/install.rb ADDED
@@ -0,0 +1,89 @@
1
require 'rbconfig'
require 'find'
# 'ftools' (File::install, File::makedirs) was removed from the standard
# library in Ruby 1.9; FileUtils provides the same operations.
require 'fileutils'
require 'tempfile'

# Name of the ruby executable, used to build the shebang line.
$ruby = RbConfig::CONFIG['ruby_install_name']

##
# Install a binary file. We patch it on the way through to
# insert a #! line. If this is a Unix install, we name
# the command (for example) 'rdig' and let the shebang line
# handle running it. Under windows, we add a '.rb' extension
# and let file associations do their stuff
#
# based on install.rb from the Rake distribution
#
# from::   path of the script to install
# opfile:: name of the installed command inside $bindir
#
# Reads the globals $ruby, $realbindir and $bindir.
def installBIN(from, opfile)
  ruby = File.join($realbindir, $ruby)
  opfile += ".rb" if RbConfig::CONFIG["target_os"] =~ /mswin/i

  # Use a uniquely named scratch file instead of a fixed "_tmp" so that an
  # existing file is never clobbered, and make sure it is always removed.
  tmp = Tempfile.new("rdig")
  begin
    tmp.puts "#!#{ruby} -w"
    tmp.write File.read(from)
    tmp.close # flush to disk before copying
    FileUtils.install(tmp.path, File.join($bindir, opfile),
                      :mode => 0755, :verbose => true)
  ensure
    tmp.close! # close (if still open) and unlink
  end
end

# Locate the site library directory. Fall back to a site_ruby entry of the
# load path (or libdir/ruby/VERSION/site_ruby) when rbconfig does not know it.
$sitedir = RbConfig::CONFIG["sitelibdir"]
unless $sitedir
  version = RbConfig::CONFIG["MAJOR"] + "." + RbConfig::CONFIG["MINOR"]
  $libdir = File.join(RbConfig::CONFIG["libdir"], "ruby", version)
  $sitedir = $:.find { |x| x =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, "site_ruby")
  elsif !$sitedir.include?(version)
    # String#=~ / !~ raise TypeError when given a String, so test with include?
    $sitedir = File.join($sitedir, version)
  end
end

$bindir = RbConfig::CONFIG["bindir"]

# The shebang line must point at the real interpreter location even when
# the files themselves are staged below DESTDIR.
$realbindir = $bindir

if (destdir = ENV['DESTDIR'])
  $bindir = destdir + $bindir
  $sitedir = destdir + $sitedir

  FileUtils.mkdir_p($bindir)
  FileUtils.mkdir_p($sitedir)
end

rdig_dest = File.join($sitedir, "rdig")
FileUtils.mkdir_p(rdig_dest, :verbose => true)
File.chmod(0755, rdig_dest)

# The library files

files = Dir.chdir('lib') { Dir['**/*.rb'] }

files.each do |fn|
  target_dir = File.join($sitedir, File.dirname(fn))
  FileUtils.mkdir_p(target_dir) unless File.exist?(target_dir)
  FileUtils.install(File.join('lib', fn), File.join($sitedir, fn),
                    :mode => 0644, :verbose => true)
end

# and the executable

installBIN("bin/rdig", "rdig")
@@ -0,0 +1,21 @@
1
+ == 2.2 (2005-11-07)
2
+ * Important bug fixes -- thanks to Moonwolf
3
+ * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
4
+ * Decimal decoding edge cases addressed.
5
+ * Test cases added.
6
+
7
+ == 2.1 (2005-10-31)
8
+ * Removed some unnecessary code in basic entity encoding.
9
+ * Improved handling of encoding: commands are now automatically sorted, so the
10
+ user doesn't have to worry about their order.
11
+ * Now using setup.rb.
12
+ * Tests moved to separate file.
13
+
14
+ == 2.0 (2005-08-23)
15
+ * Added encoding to entities.
16
+ * Decoding interface unchanged.
17
+ * Fixed a bug with handling high codepoints.
18
+
19
+ == 1.0 (2005-08-03)
20
+ * Initial release.
21
+ * Decoding only.
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2005 Paul Battley
2
+
3
+ Usage of the works is permitted provided that this instrument is retained
4
+ with the works, so that any entity that uses the works is notified of this
5
+ instrument.
6
+
7
+ DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
@@ -0,0 +1,15 @@
1
+ HTML entity encoding and decoding for Ruby
2
+
3
+ This library extends the String class to allow encoding and decoding of
4
+ HTML/XML entities from/to their corresponding UTF-8 codepoints.
5
+
6
+ To install (requires root/admin privileges):
7
+
8
+ # ruby setup.rb
9
+
10
+ To test:
11
+
12
+ $ ruby setup.rb test
13
+
14
+ Comments are welcome. Send an email to pbattley @ gmail.com.
15
+
@@ -0,0 +1,281 @@
1
+ #
2
+ # HTML entity encoding and decoding for Ruby
3
+ #
4
+ # Author:: Paul BATTLEY (pbattley @ gmail.com)
5
+ # Version:: 2.2
6
+ # Date:: 2005-11-07
7
+ #
8
+ # == About
9
+ #
10
+ # This library extends the String class to allow encoding and decoding of
11
+ # HTML/XML entities from/to their corresponding UTF-8 codepoints.
12
+ #
13
+ # == Licence
14
+ #
15
+ # Copyright (c) 2005 Paul Battley
16
+ #
17
+ # Usage of the works is permitted provided that this instrument is retained
18
+ # with the works, so that any entity that uses the works is notified of this
19
+ # instrument.
20
+ #
21
+ # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
22
+ #
23
+
24
module HTMLEntities

  VERSION = '2.2'

  #
  # MAP is a hash of all the HTML entities I could discover, as taken
  # from the w3schools page on the subject:
  # http://www.w3schools.com/html/html_entitiesref.asp
  # The format is 'entity name' => codepoint where entity name is given
  # without the surrounding ampersand and semicolon.
  #
  MAP = {
    'quot' => 34, 'apos' => 39, 'amp' => 38, 'lt' => 60, 'gt' => 62,
    'nbsp' => 160, 'iexcl' => 161, 'curren' => 164, 'cent' => 162,
    'pound' => 163, 'yen' => 165, 'brvbar' => 166, 'sect' => 167,
    'uml' => 168, 'copy' => 169, 'ordf' => 170, 'laquo' => 171,
    'not' => 172, 'shy' => 173, 'reg' => 174, 'trade' => 8482,
    'macr' => 175, 'deg' => 176, 'plusmn' => 177, 'sup2' => 178,
    'sup3' => 179, 'acute' => 180, 'micro' => 181, 'para' => 182,
    'middot' => 183, 'cedil' => 184, 'sup1' => 185, 'ordm' => 186,
    'raquo' => 187, 'frac14' => 188, 'frac12' => 189, 'frac34' => 190,
    'iquest' => 191, 'times' => 215, 'divide' => 247,
    'Agrave' => 192, 'Aacute' => 193, 'Acirc' => 194, 'Atilde' => 195,
    'Auml' => 196, 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
    'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202, 'Euml' => 203,
    'Igrave' => 204, 'Iacute' => 205, 'Icirc' => 206, 'Iuml' => 207,
    'ETH' => 208, 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
    'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214, 'Oslash' => 216,
    'Ugrave' => 217, 'Uacute' => 218, 'Ucirc' => 219, 'Uuml' => 220,
    'Yacute' => 221, 'THORN' => 222, 'szlig' => 223,
    'agrave' => 224, 'aacute' => 225, 'acirc' => 226, 'atilde' => 227,
    'auml' => 228, 'aring' => 229, 'aelig' => 230, 'ccedil' => 231,
    'egrave' => 232, 'eacute' => 233, 'ecirc' => 234, 'euml' => 235,
    'igrave' => 236, 'iacute' => 237, 'icirc' => 238, 'iuml' => 239,
    'eth' => 240, 'ntilde' => 241, 'ograve' => 242, 'oacute' => 243,
    'ocirc' => 244, 'otilde' => 245, 'ouml' => 246, 'oslash' => 248,
    'ugrave' => 249, 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252,
    'yacute' => 253, 'thorn' => 254, 'yuml' => 255,
    'OElig' => 338, 'oelig' => 339, 'Scaron' => 352, 'scaron' => 353,
    'Yuml' => 376, 'circ' => 710, 'tilde' => 732,
    'ensp' => 8194, 'emsp' => 8195, 'thinsp' => 8201, 'zwnj' => 8204,
    'zwj' => 8205, 'lrm' => 8206, 'rlm' => 8207, 'ndash' => 8211,
    'mdash' => 8212, 'lsquo' => 8216, 'rsquo' => 8217, 'sbquo' => 8218,
    'ldquo' => 8220, 'rdquo' => 8221, 'bdquo' => 8222, 'dagger' => 8224,
    'Dagger' => 8225, 'hellip' => 8230, 'permil' => 8240,
    'lsaquo' => 8249, 'rsaquo' => 8250, 'euro' => 8364
  }

  # Shortest and longest entity name in MAP; they bound the name pattern.
  MIN_LENGTH = MAP.keys.map { |name| name.length }.min
  MAX_LENGTH = MAP.keys.map { |name| name.length }.max

  # Precompile the regexp. A name starts with a letter and may continue with
  # letters or digits, so that names like 'sup2' or 'frac12' are recognised.
  NAMED_ENTITY_REGEXP =
    /&([a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}});/i

  # Highest valid Unicode codepoint; larger numeric references are left alone.
  MAX_CODEPOINT = 0x10FFFF

  # Matches a named ($1), decimal ($2) or hexadecimal ($3) entity, so that
  # decoding can be done in a single pass over the string.
  ENTITY_REGEXP =
    /&(?:([a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i

  # Reverse map for converting characters to named entities
  REVERSE_MAP = MAP.invert

  BASIC_ENTITY_REGEXP = /[<>'"&]/

  # One control character or one non-ASCII UTF-8 character. The /u flag makes
  # the pattern character-based; raw high-byte escapes such as \xc0 are not
  # accepted in a UTF-8 regexp literal on Ruby 1.9 and later.
  UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[^\x00-\x7f]/u

end

class String

  # Because there's no need to make the user worry about the order here,
  # let's handle it.
  ENCODE_ENTITIES_COMMAND_ORDER = {
    :basic => 0,
    :named => 1,
    :decimal => 2,
    :hexadecimal => 3
  }

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents. Obviously, if your string is not already in UTF-8, you'd
  # better convert it before using this method, or the output will be mixed
  # up.
  #
  # Unknown named entities and numeric references beyond U+10FFFF are not
  # converted. All entities are decoded in one pass, so the result of
  # decoding one entity is never decoded again ("&amp;#60;" yields "&#60;").
  #
  # Returns a new string; the receiver is not modified.
  #
  def decode_entities
    gsub(HTMLEntities::ENTITY_REGEXP) do
      entity, name, decimal, hex = $&, $1, $2, $3
      codepoint =
        if name
          HTMLEntities::MAP[name] # nil for unknown names
        elsif decimal
          decimal.to_i
        else
          hex.to_i(16)
        end
      if codepoint && codepoint <= HTMLEntities::MAX_CODEPOINT
        [codepoint].pack('U')
      else
        entity
      end
    end
  end

  #
  # Encode codepoints into their corresponding entities. Various operations
  # are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # Examples:
  #   str.encode_entities - XML-safe
  #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
  #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
  #   non-ASCII characters replaced with their named entity where possible, and
  #   decimal equivalents otherwise.
  #
  # Note: It is the program's responsibility to ensure that the string
  # contains valid UTF-8 before calling this method.
  #
  # Raises RuntimeError for an unknown instruction. Returns a new string.
  #
  def encode_entities(*instructions)
    instructions = [:basic] if instructions.empty?
    instructions.each do |instr|
      unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
        raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
      end
    end
    ordered = instructions.sort_by { |instr| ENCODE_ENTITIES_COMMAND_ORDER[instr] }

    ordered.inject(self) do |str, instruction|
      case instruction
      when :basic
        # unpack('U') yields the integer codepoint on every Ruby version;
        # $&[0] is a one-character String on Ruby 1.9 and later.
        str.gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
          "&#{HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]]};"
        }
      when :named
        # Test everything except printable ASCII
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          cp = $&.unpack('U')[0]
          (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
        }
      when :decimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&##{$&.unpack('U')[0]};"
        }
      when :hexadecimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&#x#{$&.unpack('U')[0].to_s(16)};"
        }
      end
    end
  end

end