rdig 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/doc/examples/config.rb +1 -1
- data/lib/rdig/content_extractors.rb +2 -0
- data/lib/rdig/crawler.rb +2 -0
- data/lib/rdig/documents.rb +2 -2
- data/lib/rdig/url_filters.rb +3 -3
- data/lib/rdig.rb +2 -1
- metadata +19 -20
data/doc/examples/config.rb
CHANGED
@@ -18,7 +18,7 @@ RDig.configuration do |cfg|
|
|
18
18
|
|
19
19
|
# this is the path where the index will be stored
|
20
20
|
# caution, existing contents of this directory will be deleted!
|
21
|
-
cfg.
|
21
|
+
cfg.index.path = '/path/to/index'
|
22
22
|
|
23
23
|
##################################################################
|
24
24
|
# options you might want to set, the given values are the defaults
|
@@ -255,6 +255,8 @@ module RDig
|
|
255
255
|
content << ' '
|
256
256
|
end
|
257
257
|
elsif element.string # it's a Tag, and it has some content string
|
258
|
+
# skip inline scripts and styles
|
259
|
+
return nil if element.name =~ /^(script|style)$/i
|
258
260
|
value = element.string.strip
|
259
261
|
unless value.empty?
|
260
262
|
content << value
|
data/lib/rdig/crawler.rb
CHANGED
data/lib/rdig/documents.rb
CHANGED
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
begin
|
33
33
|
@uri = URI.parse(args[:url])
|
34
34
|
rescue URI::InvalidURIError
|
35
|
-
raise "Cannot create document using invalid URL: #{url}"
|
35
|
+
raise "Cannot create document using invalid URL: #{args[:url]}"
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
@@ -118,7 +118,7 @@ module RDig
|
|
118
118
|
@content = ContentExtractors.process(doc.read, doc.content_type)
|
119
119
|
@status = :success
|
120
120
|
when 404
|
121
|
-
puts "got 404 for #{
|
121
|
+
puts "got 404 for #{@uri}"
|
122
122
|
else
|
123
123
|
puts "don't know what to do with response: #{doc.status.join(' : ')}"
|
124
124
|
end
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -89,11 +89,11 @@ module RDig
|
|
89
89
|
@patterns = []
|
90
90
|
if args.respond_to? :each
|
91
91
|
args.each { |pattern|
|
92
|
-
# cloning because unsure if regexps are thread safe
|
93
|
-
@patterns << pattern
|
92
|
+
# cloning because unsure if regexps are thread safe ?
|
93
|
+
@patterns << pattern #.clone
|
94
94
|
}
|
95
95
|
else
|
96
|
-
@patterns << args
|
96
|
+
@patterns << args #.clone
|
97
97
|
end
|
98
98
|
end
|
99
99
|
end
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.
|
27
|
+
RDIGVERSION = '0.3.1'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -228,6 +228,7 @@ module RDig
|
|
228
228
|
|
229
229
|
# Run the +rdig+ application.
|
230
230
|
def run
|
231
|
+
puts "RDig version #{RDIGVERSION}"
|
231
232
|
handle_options
|
232
233
|
begin
|
233
234
|
load_configfile
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.8.11
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.1
|
7
|
+
date: 2006-07-26 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,50 +25,49 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
-
post_install_message:
|
29
28
|
authors:
|
30
29
|
- Jens Kraemer
|
31
30
|
files:
|
32
31
|
- bin/rdig
|
33
32
|
- lib/rdig
|
34
|
-
- lib/htmlentities
|
35
33
|
- lib/rdig.rb
|
36
|
-
- lib/
|
34
|
+
- lib/htmlentities
|
35
|
+
- lib/rdig/documents.rb
|
36
|
+
- lib/rdig/file.rb
|
37
|
+
- lib/rdig/content_extractors.rb
|
37
38
|
- lib/rdig/search.rb
|
38
39
|
- lib/rdig/highlight.rb
|
39
40
|
- lib/rdig/index.rb
|
40
41
|
- lib/rdig/url_filters.rb
|
41
|
-
- lib/rdig/
|
42
|
-
- lib/
|
43
|
-
- lib/
|
42
|
+
- lib/rdig/crawler.rb
|
43
|
+
- lib/htmlentities/htmlentities.rb
|
44
|
+
- lib/htmlentities/README
|
44
45
|
- lib/htmlentities/CHANGES
|
45
46
|
- lib/htmlentities/COPYING
|
46
|
-
- lib/htmlentities/README
|
47
|
-
- lib/htmlentities/htmlentities.rb
|
48
47
|
- test/unit
|
49
48
|
- test/fixtures
|
50
49
|
- test/test_helper.rb
|
51
|
-
- test/unit/etag_filter_test.rb
|
52
|
-
- test/unit/url_filters_test.rb
|
53
50
|
- test/unit/html_content_extractor_test.rb
|
54
|
-
- test/unit/
|
51
|
+
- test/unit/url_filters_test.rb
|
55
52
|
- test/unit/word_content_extractor_test.rb
|
56
|
-
- test/unit/file_document_test.rb
|
57
53
|
- test/unit/crawler_fs_test.rb
|
58
|
-
- test/
|
54
|
+
- test/unit/etag_filter_test.rb
|
55
|
+
- test/unit/pdf_content_extractor_test.rb
|
56
|
+
- test/unit/file_document_test.rb
|
59
57
|
- test/fixtures/pdf
|
58
|
+
- test/fixtures/html
|
60
59
|
- test/fixtures/word
|
60
|
+
- test/fixtures/pdf/simple.pdf
|
61
61
|
- test/fixtures/html/entities.html
|
62
|
-
- test/fixtures/html/simple.html
|
63
62
|
- test/fixtures/html/custom_tag_selectors.html
|
64
|
-
- test/fixtures/
|
63
|
+
- test/fixtures/html/simple.html
|
65
64
|
- test/fixtures/word/simple.doc
|
66
65
|
- doc/examples
|
67
66
|
- doc/examples/config.rb
|
68
|
-
- LICENSE
|
69
67
|
- TODO
|
70
|
-
-
|
68
|
+
- LICENSE
|
71
69
|
- README
|
70
|
+
- CHANGES
|
72
71
|
- install.rb
|
73
72
|
- rakefile
|
74
73
|
test_files: []
|