rdig 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/doc/examples/config.rb +1 -1
- data/lib/rdig/content_extractors.rb +2 -0
- data/lib/rdig/crawler.rb +2 -0
- data/lib/rdig/documents.rb +2 -2
- data/lib/rdig/url_filters.rb +3 -3
- data/lib/rdig.rb +2 -1
- metadata +19 -20
data/doc/examples/config.rb
CHANGED
@@ -18,7 +18,7 @@ RDig.configuration do |cfg|
|
|
18
18
|
|
19
19
|
# this is the path where the index will be stored
|
20
20
|
# caution, existing contents of this directory will be deleted!
|
21
|
-
cfg.
|
21
|
+
cfg.index.path = '/path/to/index'
|
22
22
|
|
23
23
|
##################################################################
|
24
24
|
# options you might want to set, the given values are the defaults
|
data/lib/rdig/content_extractors.rb
CHANGED
@@ -255,6 +255,8 @@ module RDig
|
|
255
255
|
content << ' '
|
256
256
|
end
|
257
257
|
elsif element.string # it's a Tag, and it has some content string
|
258
|
+
# skip inline scripts and styles
|
259
|
+
return nil if element.name =~ /^(script|style)$/i
|
258
260
|
value = element.string.strip
|
259
261
|
unless value.empty?
|
260
262
|
content << value
|
data/lib/rdig/crawler.rb
CHANGED
data/lib/rdig/documents.rb
CHANGED
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
begin
|
33
33
|
@uri = URI.parse(args[:url])
|
34
34
|
rescue URI::InvalidURIError
|
35
|
-
raise "Cannot create document using invalid URL: #{url}"
|
35
|
+
raise "Cannot create document using invalid URL: #{args[:url]}"
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
@@ -118,7 +118,7 @@ module RDig
|
|
118
118
|
@content = ContentExtractors.process(doc.read, doc.content_type)
|
119
119
|
@status = :success
|
120
120
|
when 404
|
121
|
-
puts "got 404 for #{
|
121
|
+
puts "got 404 for #{@uri}"
|
122
122
|
else
|
123
123
|
puts "don't know what to do with response: #{doc.status.join(' : ')}"
|
124
124
|
end
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -89,11 +89,11 @@ module RDig
|
|
89
89
|
@patterns = []
|
90
90
|
if args.respond_to? :each
|
91
91
|
args.each { |pattern|
|
92
|
-
# cloning because unsure if regexps are thread safe
|
93
|
-
@patterns << pattern.clone
|
92
|
+
# cloning because unsure if regexps are thread safe ?
|
93
|
+
@patterns << pattern #.clone
|
94
94
|
}
|
95
95
|
else
|
96
|
-
@patterns << args.clone
|
96
|
+
@patterns << args #.clone
|
97
97
|
end
|
98
98
|
end
|
99
99
|
end
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.0'
|
27
|
+
RDIGVERSION = '0.3.1'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -228,6 +228,7 @@ module RDig
|
|
228
228
|
|
229
229
|
# Run the +rdig+ application.
|
230
230
|
def run
|
231
|
+
puts "RDig version #{RDIGVERSION}"
|
231
232
|
handle_options
|
232
233
|
begin
|
233
234
|
load_configfile
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.8.11
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.1
|
7
|
+
date: 2006-07-26 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,50 +25,49 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
-
post_install_message:
|
29
28
|
authors:
|
30
29
|
- Jens Kraemer
|
31
30
|
files:
|
32
31
|
- bin/rdig
|
33
32
|
- lib/rdig
|
34
|
-
- lib/htmlentities
|
35
33
|
- lib/rdig.rb
|
36
|
-
- lib/
|
34
|
+
- lib/htmlentities
|
35
|
+
- lib/rdig/documents.rb
|
36
|
+
- lib/rdig/file.rb
|
37
|
+
- lib/rdig/content_extractors.rb
|
37
38
|
- lib/rdig/search.rb
|
38
39
|
- lib/rdig/highlight.rb
|
39
40
|
- lib/rdig/index.rb
|
40
41
|
- lib/rdig/url_filters.rb
|
41
|
-
- lib/rdig/
|
42
|
-
- lib/
|
43
|
-
- lib/
|
42
|
+
- lib/rdig/crawler.rb
|
43
|
+
- lib/htmlentities/htmlentities.rb
|
44
|
+
- lib/htmlentities/README
|
44
45
|
- lib/htmlentities/CHANGES
|
45
46
|
- lib/htmlentities/COPYING
|
46
|
-
- lib/htmlentities/README
|
47
|
-
- lib/htmlentities/htmlentities.rb
|
48
47
|
- test/unit
|
49
48
|
- test/fixtures
|
50
49
|
- test/test_helper.rb
|
51
|
-
- test/unit/etag_filter_test.rb
|
52
|
-
- test/unit/url_filters_test.rb
|
53
50
|
- test/unit/html_content_extractor_test.rb
|
54
|
-
- test/unit/
|
51
|
+
- test/unit/url_filters_test.rb
|
55
52
|
- test/unit/word_content_extractor_test.rb
|
56
|
-
- test/unit/file_document_test.rb
|
57
53
|
- test/unit/crawler_fs_test.rb
|
58
|
-
- test/
|
54
|
+
- test/unit/etag_filter_test.rb
|
55
|
+
- test/unit/pdf_content_extractor_test.rb
|
56
|
+
- test/unit/file_document_test.rb
|
59
57
|
- test/fixtures/pdf
|
58
|
+
- test/fixtures/html
|
60
59
|
- test/fixtures/word
|
60
|
+
- test/fixtures/pdf/simple.pdf
|
61
61
|
- test/fixtures/html/entities.html
|
62
|
-
- test/fixtures/html/simple.html
|
63
62
|
- test/fixtures/html/custom_tag_selectors.html
|
64
|
-
- test/fixtures/
|
63
|
+
- test/fixtures/html/simple.html
|
65
64
|
- test/fixtures/word/simple.doc
|
66
65
|
- doc/examples
|
67
66
|
- doc/examples/config.rb
|
68
|
-
- LICENSE
|
69
67
|
- TODO
|
70
|
-
-
|
68
|
+
- LICENSE
|
71
69
|
- README
|
70
|
+
- CHANGES
|
72
71
|
- install.rb
|
73
72
|
- rakefile
|
74
73
|
test_files: []
|