rdig 0.2.1 → 0.3.0
- data/CHANGES +14 -0
- data/doc/examples/config.rb +43 -7
- data/lib/rdig.rb +35 -20
- data/lib/rdig/content_extractors.rb +75 -37
- data/lib/rdig/crawler.rb +18 -91
- data/lib/rdig/documents.rb +133 -0
- data/lib/rdig/file.rb +18 -0
- data/lib/rdig/index.rb +6 -4
- data/lib/rdig/url_filters.rb +42 -9
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/unit/crawler_fs_test.rb +32 -0
- data/test/unit/file_document_test.rb +34 -0
- data/test/unit/html_content_extractor_test.rb +14 -24
- data/test/unit/pdf_content_extractor_test.rb +3 -3
- data/test/unit/url_filters_test.rb +38 -38
- data/test/unit/word_content_extractor_test.rb +1 -1
- metadata +8 -4
- data/lib/rdig/http_client.rb +0 -22
data/CHANGES
CHANGED
@@ -1,3 +1,17 @@
+0.3.0
+- file system crawling
+- optional url rewriting before indexing, e.g. for linking to results
+  via http and building the index directly from the file system
+- PDF title extraction with pdfinfo
+- removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
+- made content extractors more flexible - instances now use a given
+  configuration instead of the global one. This allows the
+  WordContentExtractor to use an HtmlContentExtractor with it's own
+  configuration that is independent of the global config.
+
+0.2.1
+- Bugfix release
+
 0.2.0
 - add pdf and Word content extraction capabilities using the tools
   from the xpdf-utils and wv packages
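
Taken together, the new features let 0.3.0 crawl a directory tree and still store web-reachable URLs in the index. A minimal sketch combining options from the example config further down in this diff; the local path and host name are placeholders:

    RDig.configuration do |cfg|
      # crawl a local directory tree instead of a web site
      cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
      # a single fetcher thread is the suggested setting for file system crawling
      cfg.crawler.num_threads = 1
      cfg.index.path = '/path/to/index'
      # store http URLs instead of file:// uris, so search results can link
      # to the same documents served by a web server
      cfg.index.rewrite_uri = lambda { |uri|
        uri.path.gsub!(/^\/home\/bob\/documents\//, '/docs/')
        uri.scheme = 'http'
        uri.host = 'www.example.com'
      }
    end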
data/doc/examples/config.rb
CHANGED
@@ -1,25 +1,36 @@
 RDig.configuration do |cfg|
 
   ##################################################################
-  # options you should
+  # options you really should set
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]
 
+  # use something like this for crawling a file system:
+  # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
+  # beware, mixing file and http crawling is not possible and might result in
+  # unpredictable results.
+
   # limit the crawl to these hosts. The crawler will never
   # follow any links pointing to hosts other than those given here.
+  # ignored for file system crawling
   cfg.crawler.include_hosts = [ 'www.example.com' ]
 
   # this is the path where the index will be stored
   # caution, existing contents of this directory will be deleted!
-  cfg.
+  cfg.indexer.path = '/path/to/index'
 
   ##################################################################
   # options you might want to set, the given values are the defaults
+
+  # set to true to get stack traces on errors
+  # cfg.verbose = false
 
   # content extraction options
 
-  # provide a method that
+  # provide a method that returns the title of an html document
+  # this method may either return a tag to extract the title from,
+  # or a ready-to-index string.
   # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
 
   # provide a method that selects the tag containing the page content you
@@ -29,8 +40,12 @@ RDig.configuration do |cfg|
 
   # crawler options
 
-  #
-  #
+  # Notice: for file system crawling the include/exclude_document patterns are
+  # applied to the full path of _files_ only (like /home/bob/test.pdf),
+  # for http to full URIs (like http://example.com/index.html).
+
+  # nil (include all documents) or an array of Regexps
+  # matching the URLs you want to index.
   # cfg.crawler.include_documents = nil
 
   # nil (no documents excluded) or an array of Regexps
@@ -40,14 +55,35 @@ RDig.configuration do |cfg|
   # included by the inclusion patterns.
   # cfg.crawler.exclude_documents = nil
 
-  # number of
+  # number of document fetching threads to use. Should be raised only if
+  # your CPU has idle time when indexing.
   # cfg.crawler.num_threads = 2
+  # suggested setting for file system crawling:
+  # cfg.crawler.num_threads = 1
 
   # maximum number of http redirections to follow
   # cfg.crawler.max_redirects = 5
 
   # number of seconds to wait with an empty url queue before
-  # finishing the crawl. Set to a higher number
+  # finishing the crawl. Set to a higher number when experiencing incomplete
+  # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
+
+  # indexer options
+
+  # create a new index on each run. Will append to the index if false. Use when
+  # building a single index from multiple runs, e.g. one across a website and the
+  # other a tree in a local file system
+  # config.index.create = true
+
+  # rewrite document uris before indexing them. This is useful if you're
+  # indexing on disk, but the documents should be accessible via http, e.g. from
+  # a web based search application. By default, no rewriting takes place.
+  # example:
+  # cfg.index.rewrite_uri = lambda { |uri|
+  #   uri.path.gsub!(/^\/base\//, '/virtual_dir/')
+  #   uri.scheme = 'http'
+  #   uri.host = 'www.mydomain.com'
+  # }
 
 end
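
The rewrite_uri lambda mutates the document's URI in place before indexing (the Indexer calls it in the index.rb hunk further down). A stand-alone illustration of the example lambda above, using only the stdlib URI class:

    require 'uri'

    rewrite = lambda { |uri|
      uri.path.gsub!(/^\/base\//, '/virtual_dir/')
      uri.scheme = 'http'
      uri.host = 'www.mydomain.com'
    }

    uri = URI.parse('file:///base/docs/readme.html')
    rewrite.call(uri)
    uri.to_s  # => "http://www.mydomain.com/virtual_dir/docs/readme.html"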
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.
+RDIGVERSION = '0.3.0'
 
 
 require 'thread'
@@ -38,28 +38,28 @@ require 'set'
 require 'net/http'
 require 'getoptlong'
 require 'tempfile'
-
-# programs:
-require 'mkmf'
+require 'open-uri'
 
 begin
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
rescue LoadError
   require 'rubygems'
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
 end
 
 require 'htmlentities/htmlentities'
-
-require 'rdig/http_client'
+
 require 'rdig/content_extractors'
 require 'rdig/url_filters'
 require 'rdig/search'
 require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
 require 'rdig/crawler'
 
+
 $KCODE = 'u'
 require 'jcode'
 
@@ -68,17 +68,30 @@ module RDig
 
   class << self
 
-    # the filter
+    # the filter chains are for limiting the set of indexed documents.
+    # there are two chain types - one for http, and one for file system
+    # crawling.
+    # a document has to survive all filters in the chain to get indexed.
     def filter_chain
-      @filter_chain ||=
-
-        :
-
-
-
-
-
-
+      @filter_chain ||= {
+        # filter chain for http crawling
+        :http => [
+          :scheme_filter_http,
+          :fix_relative_uri,
+          :normalize_uri,
+          { :hostname_filter => :include_hosts },
+          { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
+          { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
+          RDig::UrlFilters::VisitedUrlFilter
+        ],
+        # filter chain for file system crawling
+        :file => [
+          :scheme_filter_file,
+          { RDig::UrlFilters::PathInclusionFilter => :include_documents },
+          { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
+        ]
+      }
+
     end
 
     def application
@@ -86,7 +99,7 @@ module RDig
     end
 
     def searcher
-      @searcher ||= Search::Searcher.new(config.
+      @searcher ||= Search::Searcher.new(config.index)
     end
 
     # RDig configuration
@@ -124,7 +137,7 @@ module RDig
         }
       )
     ),
-    :
+    :index => OpenStruct.new(
      :path => "index/",
      :create => true,
      :handle_parse_errors => true,
@@ -224,6 +237,8 @@ module RDig
 
     end
 
+    puts "using Ferret #{Ferret::VERSION}"
+
     if options.query
       # query the index
       puts "executing query >#{options.query}<"
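
Each chain entry is one of three forms: a Symbol naming a filter function in RDig::UrlFilters, a one-entry Hash pairing a filter (symbol or class) with the crawler config option that parameterizes it, or a bare filter class. The FilterChain implementation itself is not part of this diff, so the following interpreter is purely hypothetical and only illustrates how the structure above can be resolved:

    # Illustration only, not the actual FilterChain code.
    def build_filters(chain_description)
      chain_description.map do |entry|
        case entry
        when Symbol
          # e.g. :scheme_filter_http => UrlFilters.scheme_filter_http(doc)
          lambda { |doc| RDig::UrlFilters.send(entry, doc) }
        when Hash
          filter, option = entry.to_a.first
          args = RDig.config.crawler.send(option)
          if filter.is_a?(Class)
            # e.g. UrlInclusionFilter.new(cfg.crawler.include_documents)
            instance = filter.new(args)
            lambda { |doc| instance.apply(doc) }
          else
            # e.g. UrlFilters.hostname_filter(doc, cfg.crawler.include_hosts)
            lambda { |doc| RDig::UrlFilters.send(filter, doc, args) }
          end
        when Class
          # e.g. VisitedUrlFilter, instantiated without arguments
          instance = entry.new
          lambda { |doc| instance.apply(doc) }
        end
      end
    end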
data/lib/rdig/content_extractors.rb
CHANGED
@@ -54,7 +54,9 @@ module RDig
 
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
-      @@extractor_instances ||= extractors.map { |ex_class|
+      @@extractor_instances ||= extractors.map { |ex_class|
+        ex_class.new(RDig.configuration.content_extraction)
+      }
     end
 
     def self.process(content, content_type)
@@ -65,6 +67,10 @@ module RDig
       nil
     end
 
+    def initialize(config)
+      @config = config
+    end
+
     def can_do(content_type)
       content_type =~ @pattern
     end
@@ -91,60 +97,88 @@ module RDig
         file.delete
       end
 
-
-
-      @available = !find_executable(@executable).nil?
-    end
-    @available
-  end
-
+    # setting @available according to presence of external executables
+    # in initializer of ContentExtractor is needed to make this work
     def can_do(content_type)
-      available and super(content_type)
+      @available and super(content_type)
     end
   end
 
   # Extract text from pdf content.
   #
-  # Requires the pdftotext
+  # Requires the pdftotext and pdfinfo utilities from the
+  # xpdf-utils package
   # (on debian and friends do 'apt-get install xpdf-utils')
   #
-  # TODO: use pdfinfo to get title from document
   class PdfContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
       @pattern = /^application\/pdf/
+      @pdftotext = 'pdftotext'
+      @pdfinfo = 'pdfinfo'
+      @available = true
+      [ @pdftotext, @pdfinfo].each { |program|
+        unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+          @available = false
+          break
+        end
+      }
     end
-
+
+    def process(content)
+      result = {}
+      as_file(content) do |file|
+        result[:content] = get_content(file.path).strip
+        result[:title] = get_title(file.path)
+      end
+      result
+    end
+
     def get_content(path_to_tempfile)
-      %x{#{@
+      %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+    end
+
+    # extracts the title from pdf meta data
+    # needs pdfinfo
+    # returns the title or nil if no title was found
+    def get_title(path_to_tempfile)
+      %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+    rescue
     end
   end
 
   # Extract text from word documents
   #
-  # Requires the
-  # (on debian and friends do 'apt-get install
+  # Requires the wvHtml utility
+  # (on debian and friends do 'apt-get install wv')
   class WordContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
+      @wvhtml = 'wvHtml'
       @pattern = /^application\/msword/
-
+      # html extractor for parsing wvHtml output
+      @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
+        :html => OpenStruct.new(
+          :content_tag_selector => lambda { |tagsoup|
+            tagsoup.html.body
+          },
+          :title_tag_selector => lambda { |tagsoup|
+            tagsoup.html.head.title
+          }
+      )))
+
+      # TODO: besser: if $?.exitstatus == 127 (not found)
+      @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
     end
 
     def process(content)
       result = {}
-      as_file(content) do |
-
-        outfile.close
-        %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
-        File.open(outfile.path) do |html|
-          result = @html_extractor.process(html.read)
-        end
-        outfile.delete
+      as_file(content) do |file|
+        result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
       end
       return result || {}
     end
@@ -154,7 +188,8 @@ module RDig
   # extracts title, content and links from html documents
   class HtmlContentExtractor < ContentExtractor
 
-    def initialize
+    def initialize(config)
+      super(config)
       @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
     end
 
@@ -181,9 +216,10 @@ module RDig
     # children.
     def extract_content(tag_soup)
       content = ''
-      content_element(tag_soup)
+      ce = content_element(tag_soup)
+      ce.children { |child|
         extract_text(child, content)
-      }
+      } unless ce.nil?
       return content.strip
     end
 
@@ -197,18 +233,20 @@ module RDig
 
     # Extracts the title from the given html tree
     def extract_title(tagsoup)
-      title = ''
       the_title_tag = title_tag(tagsoup)
       if the_title_tag.is_a? String
         the_title_tag
       else
-
+        title = ''
+        extract_text(the_title_tag, title)
+        title.strip
       end
     end
 
     # Recursively extracts all text contained in the given element,
     # and appends it to content.
     def extract_text(element, content='')
+      return nil if element.nil?
       if element.is_a? NavigableString
         value = strip_comments(element)
         value.strip!
@@ -234,8 +272,8 @@ module RDig
     # This may return a string, e.g. an attribute value selected from a meta
     # tag, too.
     def title_tag(tagsoup)
-      if
-
+      if @config.html.title_tag_selector
+        @config.html.title_tag_selector.call(tagsoup)
       else
         tagsoup.html.head.title
       end
@@ -243,8 +281,8 @@ module RDig
 
     # Retrieve the root element to extract document content from
     def content_element(tagsoup)
-      if
-
+      if @config.html.content_tag_selector
+        @config.html.content_tag_selector.call(tagsoup)
       else
         tagsoup.html.body
       end
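
Because every extractor now receives its configuration at construction time, one can be used stand-alone without touching global state; the updated unit tests below build them the same way. A short sketch, reading the simple.pdf fixture shipped in this release:

    extraction_cfg = RDig.configuration.content_extraction
    extractor = RDig::ContentExtractors::PdfContentExtractor.new(extraction_cfg)
    if extractor.can_do('application/pdf')
      result = extractor.process(File.read('test/fixtures/pdf/simple.pdf'))
      result[:title]    # from the pdf meta data, extracted with pdfinfo
      result[:content]  # plain text, extracted with pdftotext
    end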
data/lib/rdig/crawler.rb
CHANGED
@@ -9,30 +9,28 @@ module RDig
     end
 
     def run
-
-
+      raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
+      @indexer = Index::Indexer.new(RDig.config.index)
+
+      # check whether we are indexing on-disk or via http
+      url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
+      chain_config = RDig.filter_chain[url_type]
+
+      filterchain = UrlFilters::FilterChain.new(chain_config)
       RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
 
       num_threads = RDig.config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
-          filterchain = UrlFilters::FilterChain.new(
+          filterchain = UrlFilters::FilterChain.new(chain_config)
           while (doc = @documents.pop) != :exit
             process_document doc, filterchain
           end
         }
       }
 
-      #
-      # t1 pops the start url from the queue which now is empty
-      # as the queue is empty now, t2 blocks until t1 adds the links
-      # retrieved from his document.
-      #
-      # But we need the 'queue empty' condition as a sign for us to stop
-      # waiting for new entries, too.
-
-      # check every now and then for an empty queue
+      # check for an empty queue every now and then
       sleep_interval = RDig.config.crawler.wait_before_leave
       begin
         sleep sleep_interval
@@ -54,22 +52,10 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
-      when :success
-        if doc.content
-          if doc.content[:links]
-            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-          end
-          @indexer << doc
-        #else
-          #puts "success but no content: #{doc.uri.to_s}"
-        end
-      when :redirect
-        # links contains the url we were redirected to
-        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-      end
+      @indexer << doc if doc.needs_indexing?
     rescue
       puts "error processing document #{doc.uri.to_s}: #{$!}"
+      puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
     end
 
 
@@ -78,82 +64,23 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-      if referring_document
-        doc = Document.
-        # keep redirect count
-        if referring_document.status == :redirect
-          doc.redirections = referring_document.redirections + 1
-        end
+      if referring_document and referring_document.uri.scheme =~ /^https?/i
+        doc = Document.create(url, referring_document.uri)
       else
-        doc = Document.
+        doc = Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
-
-        #
-        #puts "skipping url #{url}"
+        @documents << doc
+        puts "added url #{url}" if RDig::config.verbose
       end
-      @documents << doc if doc
     end
 
   end
 
 
-  class Document
-    include HttpClient
-
-    attr_reader :content
-    attr_reader :content_type
-    attr_reader :uri
-    attr_reader :referring_uri
-    attr_reader :status
-    attr_reader :etag
-    attr_accessor :redirections
-
-    # url: url of this document, may be relative to the referring doc or host.
-    # referrer: uri of the document we retrieved this link from
-    def initialize(url, referrer = nil)
-      @redirections = 0
-      begin
-        @uri = URI.parse(url)
-      rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{url}"
-      end
-      @referring_uri = referrer
-    end
-
-    def has_content?
-      !self.content.nil?
-    end
-
-    def title; @content[:title] end
-    def body; @content[:content] end
-    def url; @uri.to_s end
-
-    def fetch
-      puts "fetching #{@uri.to_s}"
-      response = do_get(@uri)
-      case response
-      when Net::HTTPSuccess
-        @content_type = response['content-type']
-        @raw_body = response.body
-        @etag = response['etag']
-        # todo externalize this (another chain ?)
-        @content = ContentExtractors.process(@raw_body, @content_type)
-        @status = :success
-      when Net::HTTPRedirection
-        @status = :redirect
-        @content = { :links => [ response['location'] ] }
-      else
-        puts "don't know what to do with response: #{response}"
-      end
-
-    end
-
-  end
-
   # checks fetched documents' E-Tag headers against the list of E-Tags
   # of the documents already indexed.
   # This is supposed to help against double-indexing documents which can
@@ -169,7 +96,7 @@ module RDig
     end
 
     def apply(document)
-      return document unless document.etag
+      return document unless (document.respond_to?(:etag) && document.etag)
       synchronize do
         @etags.add?(document.etag) ? document : nil
       end
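
With fetching moved into the Document classes (next file), running a crawl comes down to configuring and calling run, just as the new crawler_fs_test.rb below does; the start url is a placeholder:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
      cfg.crawler.num_threads = 1
      cfg.index.path = 'tmp/test-index'
    end
    crawler = RDig::Crawler.new
    crawler.run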
data/lib/rdig/documents.rb
ADDED
@@ -0,0 +1,133 @@
+module RDig
+
+  #
+  # Document base class
+  #
+  class Document
+
+    attr_reader :uri
+    attr_reader :content
+    attr_reader :content_type
+
+    def self.create(url, referrer_uri = nil)
+      # a referrer is a clear enough hint to create an HttpDocument
+      if referrer_uri && referrer_uri.scheme =~ /^https?$/i
+        return HttpDocument.new(:url => url, :referrer => referrer_uri)
+      end
+
+      case url
+      when /^https?:\/\//i
+        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
+      when /^file:\/\//i
+        # files don't have referrers - the check for nil prevents us from being
+        # tricked into indexing local files by file:// links in the web site
+        # we index.
+        FileDocument.new(:url => url) if referrer_uri.nil?
+      end
+    end
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args)
+      begin
+        @uri = URI.parse(args[:url])
+      rescue URI::InvalidURIError
+        raise "Cannot create document using invalid URL: #{url}"
+      end
+    end
+
+    def title; @content[:title] end
+    def body; @content[:content] end
+    def links; @content[:links] end
+
+    def needs_indexing?
+      has_content? && (title || body)
+    end
+
+    def has_content?
+      !self.content.nil?
+    end
+
+  end
+
+
+  #
+  # Document in a File system
+  #
+  class FileDocument < Document
+    def initialize(args={})
+      super(args)
+    end
+
+    def self.find_files(path)
+      links = []
+      Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        # Skip files not matching known mime types
+        pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
+        if File.directory?(filename) || filename =~ pattern
+          links << "file://#{filename}"
+        end
+      end
+      links
+    end
+
+    def file?
+      File.file? @uri.path
+    end
+
+    def fetch
+      if File.directory? @uri.path
+        # directories are treated like a link collection
+        @content = { :links => self.class.find_files(@uri.path) }
+      else
+        # process this file's contents
+        open(@uri.path) do |file|
+          @content = ContentExtractors.process(file.read, file.content_type)
+          @content[:links] = nil if @content # don't follow links inside files
+        end
+      end
+      @content ||= {}
+    end
+
+  end
+
+
+  #
+  # Remote Document to be retrieved by HTTP
+  #
+  class HttpDocument < Document
+
+    attr_reader :referring_uri
+    attr_reader :status
+    attr_reader :etag
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args={})
+      super(args)
+      @referring_uri = args[:referrer]
+    end
+
+    def fetch
+      puts "fetching #{@uri.to_s}" if RDig::config.verbose
+      open(@uri.to_s) do |doc|
+        case doc.status.first.to_i
+        when 200
+          @etag = doc.meta['etag']
+          # puts "etag: #{@etag}"
+          @content = ContentExtractors.process(doc.read, doc.content_type)
+          @status = :success
+        when 404
+          puts "got 404 for #{url}"
+        else
+          puts "don't know what to do with response: #{doc.status.join(' : ')}"
+        end
+      end
+    rescue
+      puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
+    ensure
+      @content ||= {}
+    end
+
+  end
+end
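
Document.create is the factory the crawler's add_url relies on; the URL scheme, or the presence of an http referrer, decides which subclass gets built:

    RDig::Document.create('file:///home/bob/documents/readme.pdf').class
    # => RDig::FileDocument

    RDig::Document.create('http://www.example.com/index.html').class
    # => RDig::HttpDocument

    # a link found on a crawled page, possibly relative: an http referrer
    # always yields an HttpDocument
    referrer = URI.parse('http://www.example.com/')
    RDig::Document.create('about.html', referrer).class
    # => RDig::HttpDocument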
data/lib/rdig/file.rb
ADDED
@@ -0,0 +1,18 @@
+# Extend class File with a content_type method
+class File
+
+  # mime types and file extensions
+  FILE_EXTENSION_MIME_TYPES = {
+    'doc' => 'application/msword',
+    'html' => 'text/html',
+    'htm' => 'text/html',
+    #'.odt' => 'application/vnd.oasis.opendocument.text',
+    'pdf' => 'application/pdf',
+    'txt' => 'text/plain',
+  }
+
+  def content_type
+    FILE_EXTENSION_MIME_TYPES[File.extname(self.path).downcase.gsub(/^\./,'')] || 'application/octet-stream'
+  end
+
+end
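
This gives every File handle a content type derived from its extension, falling back to application/octet-stream for anything unknown; the paths are placeholders:

    File.open('/home/bob/documents/report.pdf') { |f| f.content_type }
    # => "application/pdf"
    File.open('/home/bob/documents/archive.zip') { |f| f.content_type }
    # => "application/octet-stream"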
data/lib/rdig/index.rb
CHANGED
@@ -6,7 +6,7 @@ module RDig
     include MonitorMixin, Ferret::Index, Ferret::Document
 
     def initialize(settings)
-
+      @config = settings
       @index_writer = IndexWriter.new(settings.path,
                                       :create => settings.create,
                                       :analyzer => settings.analyzer)
@@ -14,10 +14,12 @@ module RDig
     end
 
     def add_to_index(document)
-      puts "add to index: #{document.uri.to_s}"
+      puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
       doc = Ferret::Document::Document.new
-
-
+      @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
+
+      doc << Field.new("url", document.uri.to_s,
+                       Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("title", document.title,
                        Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("data", document.body,
data/lib/rdig/url_filters.rb
CHANGED
@@ -82,7 +82,7 @@ module RDig
 
 
     # base class for url inclusion / exclusion filters
-    class
+    class PatternFilter
       # takes an Array of Regexps, or nil to disable the filter
       def initialize(args=nil)
         unless args.nil?
@@ -98,8 +98,8 @@ module RDig
         end
       end
     end
-    class UrlExclusionFilter <
-      # returns nil if any of the patterns matches it's
+    class UrlExclusionFilter < PatternFilter
+      # returns nil if any of the patterns matches it's URI,
       # the document itself otherwise
       def apply(document)
         return document unless @patterns
@@ -109,9 +109,9 @@ module RDig
         return document
       end
     end
-    class UrlInclusionFilter <
-      # returns
-      #
+    class UrlInclusionFilter < PatternFilter
+      # returns the document if any of the patterns matches it's URI,
+      # nil otherwise
       def apply(document)
         return document unless @patterns
         @patterns.each { |p|
@@ -121,21 +121,42 @@ module RDig
         end
       end
     end
-
+    # returns nil if any of the patterns matches it's path,
+    # the document itself otherwise. Applied to real files only.
+    class PathExclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return nil if document.uri.path =~ p
+        }
+        return document
+      end
+    end
+    # returns the document if any of the patterns matches it's path,
+    # nil otherwise. Applied to real files only
+    class PathInclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return document if document.uri.path =~ p
+        }
+        return nil
+      end
+    end
 
 
     # checks redirect count of the given document
     # takes it out of the chain if number of redirections exceeds the
     # max_redirects setting
     def UrlFilters.maximum_redirect_filter(document, max_redirects)
-      return nil if document.redirections > max_redirects
+      return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
       return document
     end
 
     # expands both href="/path/xyz.html" and href="affe.html"
     # to full urls
     def UrlFilters.fix_relative_uri(document)
-      return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^
+      #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
       ref = document.referring_uri
       return document unless ref
       uri = document.uri
@@ -150,6 +171,9 @@ module RDig
         uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
       end
       return document
+    rescue
+      p document
+      p document.uri
     end
 
     def UrlFilters.hostname_filter(document, include_hosts)
@@ -167,5 +191,14 @@ module RDig
       return document
     end
 
+    def UrlFilters.scheme_filter_file(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^file$/i)
+      nil
+    end
+    def UrlFilters.scheme_filter_http(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^https?$/i)
+      nil
+    end
+
   end
 end
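
The new path filters mirror the URL filters but match against uri.path and only apply to real files; directories always pass so the crawler can still descend into them. A brief sketch, assuming the file and directory actually exist on disk:

    f = RDig::UrlFilters::PathExclusionFilter.new([ /~$/ ])
    f.apply(RDig::Document.create('file:///home/bob/documents/draft.txt~'))
    # => nil, backup files are dropped
    f.apply(RDig::Document.create('file:///home/bob/documents/'))
    # => the document itself, directories are not filtered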
data/test/fixtures/pdf/simple.pdf
CHANGED
Binary file
data/test/unit/crawler_fs_test.rb
ADDED
@@ -0,0 +1,32 @@
+require 'test_helper'
+class CrawlerFsTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_crawl
+    crawler = Crawler.new
+    crawler.run
+  end
+
+end
+
+
data/test/unit/file_document_test.rb
ADDED
@@ -0,0 +1,34 @@
+require 'test_helper'
+class FileDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_find_files
+    links = FileDocument.find_files(@fixture_path)
+    assert_equal 3, links.size
+    links = FileDocument.find_files("#{@fixture_path}/html")
+    assert_equal 3, links.size
+  end
+
+  def test_fetch_directory
+    dir = Document.create("file://#{@fixture_path}")
+    dir.fetch
+    assert_equal 3, dir.links.size
+    dir = Document.create("file://#{@fixture_path}/pdf")
+    dir.fetch
+    assert_equal 1, dir.links.size
+  end
+
+  def test_fetch_content
+    file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
+    file.fetch
+    assert file.needs_indexing?
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
+  end
+
+end
+
+
data/test/unit/html_content_extractor_test.rb
CHANGED
@@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @
+    @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
+    @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
     @nbsp = [160].pack('U') # non breaking space
-    @config_backup = RDig.config.content_extraction.html.clone
-  end
-
-  def teardown
-    RDig.config.content_extraction.html = @config_backup
   end
 
   def test_can_do
@@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
 
   def test_custom_content_element
-
-
-
-
-
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('h1', :attrs => { 'class', 'title' })
+    end
+    @config.html.content_tag_selector = lambda do |tagsoup|
+      tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Sample Title in h1', result[:title]
@@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
 
 
   def test_title_from_dcmeta
-
-
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC meta data', result[:title]
   end
 
   def test_preprocessed_title
-
-
-
-
-
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC', result[:title]
data/test/unit/pdf_content_extractor_test.rb
CHANGED
@@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @ce = ContentExtractors::PdfContentExtractor.new
+    @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
   end
 
   def test_can_do
@@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   private
   def check_content(result)
     assert_not_nil result
-
+    assert_equal 'PDF Test', result[:title]
     assert_nil result[:links]
     assert_not_nil result[:content]
-    assert_equal 'This
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
   end
 
 end
data/test/unit/url_filters_test.rb
CHANGED
@@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   # test default chain config
   def test_default_filterchain
-    chain = UrlFilters::FilterChain.new(RDig.filter_chain)
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
+    assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
+    assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
   end
 
   # check lookup of chain parameters from config
@@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   def test_urlpattern_filter
     f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
     f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
-    assert_not_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_nil f.apply(Document.create("http://test.host/affe.html"))
+    assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
     f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
     f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
   end
 
   def test_hostname_filter
     include_hosts = [ 'test.host', 'localhost' ]
-    assert_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
+    assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
   end
 
   def test_fix_relative_uri
-    doc = Document.
+    doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(Document.
+      UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
   end
 end
metadata
CHANGED
@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
+rubygems_version: 0.8.11.15
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-04-
+  version: 0.3.0
+date: 2006-04-26 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
 platform: ruby
 signing_key:
 cert_chain:
+post_install_message:
 authors:
 - Jens Kraemer
 files:
@@ -32,13 +33,14 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
-- lib/rdig/http_client.rb
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
 - lib/rdig/index.rb
 - lib/rdig/url_filters.rb
 - lib/rdig/content_extractors.rb
+- lib/rdig/documents.rb
+- lib/rdig/file.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -51,6 +53,8 @@ files:
 - test/unit/html_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
+- test/unit/file_document_test.rb
+- test/unit/crawler_fs_test.rb
 - test/fixtures/html
 - test/fixtures/pdf
 - test/fixtures/word
data/lib/rdig/http_client.rb
DELETED
@@ -1,22 +0,0 @@
-module RDig
-
-  module HttpClient
-    def do_get(uri, user_agent='RDig crawler')
-      # Set up the appropriate http headers
-      headers = { "User-Agent" => user_agent }
-      result = {}
-
-      begin
-        Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
-          final_uri = uri.path
-          final_uri += ('?' + uri.query) if uri.query
-          return http.get(final_uri, headers)
-        }
-      rescue => error
-        puts error
-      end
-    end
-  end
-
-end