rdig 0.2.1 → 0.3.0
- data/CHANGES +14 -0
- data/doc/examples/config.rb +43 -7
- data/lib/rdig.rb +35 -20
- data/lib/rdig/content_extractors.rb +75 -37
- data/lib/rdig/crawler.rb +18 -91
- data/lib/rdig/documents.rb +133 -0
- data/lib/rdig/file.rb +18 -0
- data/lib/rdig/index.rb +6 -4
- data/lib/rdig/url_filters.rb +42 -9
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/unit/crawler_fs_test.rb +32 -0
- data/test/unit/file_document_test.rb +34 -0
- data/test/unit/html_content_extractor_test.rb +14 -24
- data/test/unit/pdf_content_extractor_test.rb +3 -3
- data/test/unit/url_filters_test.rb +38 -38
- data/test/unit/word_content_extractor_test.rb +1 -1
- metadata +8 -4
- data/lib/rdig/http_client.rb +0 -22
data/CHANGES
CHANGED
@@ -1,3 +1,17 @@
+0.3.0
+- file system crawling
+- optional url rewriting before indexing, e.g. for linking to results
+  via http and building the index directly from the file system
+- PDF title extraction with pdfinfo
+- removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
+- made content extractors more flexible - instances now use a given
+  configuration instead of the global one. This allows the
+  WordContentExtractor to use an HtmlContentExtractor with it's own
+  configuration that is independent of the global config.
+
+0.2.1
+- Bugfix release
+
 0.2.0
 - add pdf and Word content extraction capabilities using the tools
   from the xpdf-utils and wv packages
data/doc/examples/config.rb
CHANGED
@@ -1,25 +1,36 @@
 RDig.configuration do |cfg|
 
   ##################################################################
-  # options you should
+  # options you really should set
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]
 
+  # use something like this for crawling a file system:
+  # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
+  # beware, mixing file and http crawling is not possible and might result in
+  # unpredictable results.
+
   # limit the crawl to these hosts. The crawler will never
   # follow any links pointing to hosts other than those given here.
+  # ignored for file system crawling
   cfg.crawler.include_hosts = [ 'www.example.com' ]
 
   # this is the path where the index will be stored
   # caution, existing contents of this directory will be deleted!
-  cfg.
+  cfg.indexer.path = '/path/to/index'
 
   ##################################################################
   # options you might want to set, the given values are the defaults
+
+  # set to true to get stack traces on errors
+  # cfg.verbose = false
 
   # content extraction options
 
-  # provide a method that
+  # provide a method that returns the title of an html document
+  # this method may either return a tag to extract the title from,
+  # or a ready-to-index string.
   # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
 
   # provide a method that selects the tag containing the page content you
@@ -29,8 +40,12 @@ RDig.configuration do |cfg|
 
   # crawler options
 
-  #
-  #
+  # Notice: for file system crawling the include/exclude_document patterns are
+  # applied to the full path of _files_ only (like /home/bob/test.pdf),
+  # for http to full URIs (like http://example.com/index.html).
+
+  # nil (include all documents) or an array of Regexps
+  # matching the URLs you want to index.
   # cfg.crawler.include_documents = nil
 
   # nil (no documents excluded) or an array of Regexps
@@ -40,14 +55,35 @@ RDig.configuration do |cfg|
   # included by the inclusion patterns.
   # cfg.crawler.exclude_documents = nil
 
-  # number of
+  # number of document fetching threads to use. Should be raised only if
+  # your CPU has idle time when indexing.
   # cfg.crawler.num_threads = 2
+  # suggested setting for file system crawling:
+  # cfg.crawler.num_threads = 1
 
   # maximum number of http redirections to follow
   # cfg.crawler.max_redirects = 5
 
   # number of seconds to wait with an empty url queue before
-  # finishing the crawl. Set to a higher number
+  # finishing the crawl. Set to a higher number when experiencing incomplete
+  # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
+
+  # indexer options
+
+  # create a new index on each run. Will append to the index if false. Use when
+  # building a single index from multiple runs, e.g. one across a website and the
+  # other a tree in a local file system
+  # config.index.create = true
+
+  # rewrite document uris before indexing them. This is useful if you're
+  # indexing on disk, but the documents should be accessible via http, e.g. from
+  # a web based search application. By default, no rewriting takes place.
+  # example:
+  # cfg.index.rewrite_uri = lambda { |uri|
+  #   uri.path.gsub!(/^\/base\//, '/virtual_dir/')
+  #   uri.scheme = 'http'
+  #   uri.host = 'www.mydomain.com'
+  # }
 
 end
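The file-system crawling and URI-rewriting options documented above are meant to be combined. Below is a minimal sketch of a 0.3.0-style configuration that indexes a local tree but stores http URLs in the index; the paths and hostname are placeholders, and `cfg.index.path` follows the setting used by the test suite further down.

# hypothetical example assembled from the options shown above
RDig.configuration do |cfg|
  # crawl a local directory tree instead of a web site
  cfg.crawler.start_urls = [ 'file:///var/www/docs/' ]
  # single-threaded crawling is the suggested setting for the file system
  cfg.crawler.num_threads = 1
  cfg.index.path = '/path/to/index'
  # store http URLs so search results link to the site serving these files
  cfg.index.rewrite_uri = lambda { |uri|
    uri.path.gsub!(/^\/var\/www\/docs\//, '/docs/')
    uri.scheme = 'http'
    uri.host = 'www.example.com'
  }
end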
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.
+RDIGVERSION = '0.3.0'
 
 
 require 'thread'
@@ -38,28 +38,28 @@ require 'set'
 require 'net/http'
 require 'getoptlong'
 require 'tempfile'
-
-# programs:
-require 'mkmf'
+require 'open-uri'
 
 begin
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
 rescue LoadError
   require 'rubygems'
-  require 'rubyful_soup'
   require 'ferret'
+  require 'rubyful_soup'
 end
 
 require 'htmlentities/htmlentities'
-
-require 'rdig/http_client'
+
 require 'rdig/content_extractors'
 require 'rdig/url_filters'
 require 'rdig/search'
 require 'rdig/index'
+require 'rdig/file'
+require 'rdig/documents'
 require 'rdig/crawler'
 
+
 $KCODE = 'u'
 require 'jcode'
 
@@ -68,17 +68,30 @@ module RDig
 
   class << self
 
-    # the filter
+    # the filter chains are for limiting the set of indexed documents.
+    # there are two chain types - one for http, and one for file system
+    # crawling.
+    # a document has to survive all filters in the chain to get indexed.
     def filter_chain
-      @filter_chain ||=
-
-        :
-
-
-
-
-
-
+      @filter_chain ||= {
+        # filter chain for http crawling
+        :http => [
+          :scheme_filter_http,
+          :fix_relative_uri,
+          :normalize_uri,
+          { :hostname_filter => :include_hosts },
+          { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
+          { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
+          RDig::UrlFilters::VisitedUrlFilter
+        ],
+        # filter chain for file system crawling
+        :file => [
+          :scheme_filter_file,
+          { RDig::UrlFilters::PathInclusionFilter => :include_documents },
+          { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
+        ]
+      }
+
     end
 
     def application
@@ -86,7 +99,7 @@ module RDig
     end
 
     def searcher
-      @searcher ||= Search::Searcher.new(config.
+      @searcher ||= Search::Searcher.new(config.index)
     end
 
     # RDig configuration
@@ -124,7 +137,7 @@ module RDig
           }
         )
       ),
-      :
+      :index => OpenStruct.new(
        :path => "index/",
        :create => true,
        :handle_parse_errors => true,
@@ -224,6 +237,8 @@ module RDig
 
     end
 
+    puts "using Ferret #{Ferret::VERSION}"
+
     if options.query
       # query the index
       puts "executing query >#{options.query}<"
data/lib/rdig/content_extractors.rb
CHANGED
@@ -54,7 +54,9 @@ module RDig
 
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
-      @@extractor_instances ||= extractors.map { |ex_class|
+      @@extractor_instances ||= extractors.map { |ex_class|
+        ex_class.new(RDig.configuration.content_extraction)
+      }
     end
 
     def self.process(content, content_type)
@@ -65,6 +67,10 @@ module RDig
       nil
     end
 
+    def initialize(config)
+      @config = config
+    end
+
     def can_do(content_type)
       content_type =~ @pattern
     end
@@ -91,60 +97,88 @@ module RDig
       file.delete
     end
 
-
-
-      @available = !find_executable(@executable).nil?
-    end
-    @available
-  end
-
+    # setting @available according to presence of external executables
+    # in initializer of ContentExtractor is needed to make this work
     def can_do(content_type)
-      available and super(content_type)
+      @available and super(content_type)
     end
   end
 
   # Extract text from pdf content.
   #
-  # Requires the pdftotext
+  # Requires the pdftotext and pdfinfo utilities from the
+  # xpdf-utils package
   # (on debian and friends do 'apt-get install xpdf-utils')
   #
-  # TODO: use pdfinfo to get title from document
   class PdfContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
       @pattern = /^application\/pdf/
+      @pdftotext = 'pdftotext'
+      @pdfinfo = 'pdfinfo'
+      @available = true
+      [ @pdftotext, @pdfinfo].each { |program|
+        unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
+          @available = false
+          break
+        end
+      }
     end
-
+
+    def process(content)
+      result = {}
+      as_file(content) do |file|
+        result[:content] = get_content(file.path).strip
+        result[:title] = get_title(file.path)
+      end
+      result
+    end
+
     def get_content(path_to_tempfile)
-      %x{#{@
+      %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
+    end
+
+    # extracts the title from pdf meta data
+    # needs pdfinfo
+    # returns the title or nil if no title was found
+    def get_title(path_to_tempfile)
+      %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
+    rescue
     end
   end
 
   # Extract text from word documents
   #
-  # Requires the
-  # (on debian and friends do 'apt-get install
+  # Requires the wvHtml utility
+  # (on debian and friends do 'apt-get install wv')
   class WordContentExtractor < ContentExtractor
     include ExternalAppHelper
 
-    def initialize
-
+    def initialize(config)
+      super(config)
+      @wvhtml = 'wvHtml'
       @pattern = /^application\/msword/
-
+      # html extractor for parsing wvHtml output
+      @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
+        :html => OpenStruct.new(
+          :content_tag_selector => lambda { |tagsoup|
+            tagsoup.html.body
+          },
+          :title_tag_selector => lambda { |tagsoup|
+            tagsoup.html.head.title
+          }
      )))
+
+      # TODO: besser: if $?.exitstatus == 127 (not found)
+      @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
     end
 
     def process(content)
       result = {}
-      as_file(content) do |
-
-        outfile.close
-        %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
-        File.open(outfile.path) do |html|
-          result = @html_extractor.process(html.read)
-        end
-        outfile.delete
+      as_file(content) do |file|
+        result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
       end
       return result || {}
     end
@@ -154,7 +188,8 @@ module RDig
   # extracts title, content and links from html documents
   class HtmlContentExtractor < ContentExtractor
 
-    def initialize
+    def initialize(config)
+      super(config)
       @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
     end
 
@@ -181,9 +216,10 @@ module RDig
     # children.
     def extract_content(tag_soup)
       content = ''
-      content_element(tag_soup)
+      ce = content_element(tag_soup)
+      ce.children { |child|
         extract_text(child, content)
-      }
+      } unless ce.nil?
       return content.strip
     end
 
@@ -197,18 +233,20 @@ module RDig
 
     # Extracts the title from the given html tree
     def extract_title(tagsoup)
-      title = ''
       the_title_tag = title_tag(tagsoup)
       if the_title_tag.is_a? String
         the_title_tag
       else
-
+        title = ''
+        extract_text(the_title_tag, title)
+        title.strip
       end
     end
 
     # Recursively extracts all text contained in the given element,
     # and appends it to content.
     def extract_text(element, content='')
+      return nil if element.nil?
       if element.is_a? NavigableString
         value = strip_comments(element)
         value.strip!
@@ -234,8 +272,8 @@ module RDig
     # This may return a string, e.g. an attribute value selected from a meta
     # tag, too.
     def title_tag(tagsoup)
-      if
-
+      if @config.html.title_tag_selector
+        @config.html.title_tag_selector.call(tagsoup)
       else
         tagsoup.html.head.title
       end
@@ -243,8 +281,8 @@ module RDig
 
     # Retrieve the root element to extract document content from
     def content_element(tagsoup)
-      if
-
+      if @config.html.content_tag_selector
+        @config.html.content_tag_selector.call(tagsoup)
       else
         tagsoup.html.body
       end
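The changelog entry about more flexible content extractors shows up here: each extractor instance now receives a configuration object instead of reading the global one, which is how WordContentExtractor above wires up its private HtmlContentExtractor. A rough usage sketch follows; the selector and markup are invented for illustration.

require 'ostruct'

# an HtmlContentExtractor with its own, non-global configuration
config = OpenStruct.new(:html => OpenStruct.new(
  :content_tag_selector => lambda { |tagsoup|
    tagsoup.find('div', :attrs => { 'id', 'content' })
  },
  :title_tag_selector => lambda { |tagsoup| tagsoup.html.head.title }
))
extractor = RDig::ContentExtractors::HtmlContentExtractor.new(config)
result = extractor.process('<html><head><title>Demo</title></head>' +
                           '<body><div id="content">Indexed text.</div></body></html>')
result[:title]    # title text picked by title_tag_selector
result[:content]  # text extracted below the selected content tag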
data/lib/rdig/crawler.rb
CHANGED
@@ -9,30 +9,28 @@ module RDig
     end
 
     def run
-
-
+      raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
+      @indexer = Index::Indexer.new(RDig.config.index)
+
+      # check whether we are indexing on-disk or via http
+      url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
+      chain_config = RDig.filter_chain[url_type]
+
+      filterchain = UrlFilters::FilterChain.new(chain_config)
       RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
 
       num_threads = RDig.config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
-          filterchain = UrlFilters::FilterChain.new(
+          filterchain = UrlFilters::FilterChain.new(chain_config)
           while (doc = @documents.pop) != :exit
             process_document doc, filterchain
           end
         }
       }
 
-      #
-      # t1 pops the start url from the queue which now is empty
-      # as the queue is empty now, t2 blocks until t1 adds the links
-      # retrieved from his document.
-      #
-      # But we need the 'queue empty' condition as a sign for us to stop
-      # waiting for new entries, too.
-
-      # check every now and then for an empty queue
+      # check for an empty queue every now and then
       sleep_interval = RDig.config.crawler.wait_before_leave
       begin
         sleep sleep_interval
@@ -54,22 +52,10 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
-      when :success
-        if doc.content
-          if doc.content[:links]
-            doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-          end
-          @indexer << doc
-        #else
-          #puts "success but no content: #{doc.uri.to_s}"
-        end
-      when :redirect
-        # links contains the url we were redirected to
-        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
-      end
+      @indexer << doc if doc.needs_indexing?
     rescue
       puts "error processing document #{doc.uri.to_s}: #{$!}"
+      puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
     end
 
 
@@ -78,82 +64,23 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-      if referring_document
-        doc = Document.
-        # keep redirect count
-        if referring_document.status == :redirect
-          doc.redirections = referring_document.redirections + 1
-        end
+      if referring_document and referring_document.uri.scheme =~ /^https?/i
+        doc = Document.create(url, referring_document.uri)
       else
-        doc = Document.
+        doc = Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
-
-        #
-        #puts "skipping url #{url}"
+        @documents << doc
+        puts "added url #{url}" if RDig::config.verbose
       end
-      @documents << doc if doc
     end
 
   end
 
 
-  class Document
-    include HttpClient
-
-    attr_reader :content
-    attr_reader :content_type
-    attr_reader :uri
-    attr_reader :referring_uri
-    attr_reader :status
-    attr_reader :etag
-    attr_accessor :redirections
-
-    # url: url of this document, may be relative to the referring doc or host.
-    # referrer: uri of the document we retrieved this link from
-    def initialize(url, referrer = nil)
-      @redirections = 0
-      begin
-        @uri = URI.parse(url)
-      rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{url}"
-      end
-      @referring_uri = referrer
-    end
-
-    def has_content?
-      !self.content.nil?
-    end
-
-    def title; @content[:title] end
-    def body; @content[:content] end
-    def url; @uri.to_s end
-
-    def fetch
-      puts "fetching #{@uri.to_s}"
-      response = do_get(@uri)
-      case response
-      when Net::HTTPSuccess
-        @content_type = response['content-type']
-        @raw_body = response.body
-        @etag = response['etag']
-        # todo externalize this (another chain ?)
-        @content = ContentExtractors.process(@raw_body, @content_type)
-        @status = :success
-      when Net::HTTPRedirection
-        @status = :redirect
-        @content = { :links => [ response['location'] ] }
-      else
-        puts "don't know what to do with response: #{response}"
-      end
-
-    end
-
-  end
-
   # checks fetched documents' E-Tag headers against the list of E-Tags
   # of the documents already indexed.
   # This is supposed to help against double-indexing documents which can
@@ -169,7 +96,7 @@ module RDig
     end
 
     def apply(document)
-      return document unless document.etag
+      return document unless (document.respond_to?(:etag) && document.etag)
       synchronize do
         @etags.add?(document.etag) ? document : nil
       end
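With the reworked run method above, the crawler picks the :file or :http filter chain from the scheme of the first start URL. Here is a sketch of kicking off a file-system crawl programmatically, mirroring the crawler_fs_test further down; the path is a placeholder.

RDig.configuration do |cfg|
  cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
  cfg.crawler.num_threads = 1
end
# the file:// start URL selects RDig.filter_chain[:file]
RDig::Crawler.new.run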
data/lib/rdig/documents.rb
ADDED
@@ -0,0 +1,133 @@
+module RDig
+
+  #
+  # Document base class
+  #
+  class Document
+
+    attr_reader :uri
+    attr_reader :content
+    attr_reader :content_type
+
+    def self.create(url, referrer_uri = nil)
+      # a referrer is a clear enough hint to create an HttpDocument
+      if referrer_uri && referrer_uri.scheme =~ /^https?$/i
+        return HttpDocument.new(:url => url, :referrer => referrer_uri)
+      end
+
+      case url
+      when /^https?:\/\//i
+        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
+      when /^file:\/\//i
+        # files don't have referrers - the check for nil prevents us from being
+        # tricked into indexing local files by file:// links in the web site
+        # we index.
+        FileDocument.new(:url => url) if referrer_uri.nil?
+      end
+    end
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args)
+      begin
+        @uri = URI.parse(args[:url])
+      rescue URI::InvalidURIError
+        raise "Cannot create document using invalid URL: #{url}"
+      end
+    end
+
+    def title; @content[:title] end
+    def body; @content[:content] end
+    def links; @content[:links] end
+
+    def needs_indexing?
+      has_content? && (title || body)
+    end
+
+    def has_content?
+      !self.content.nil?
+    end
+
+  end
+
+
+  #
+  # Document in a File system
+  #
+  class FileDocument < Document
+    def initialize(args={})
+      super(args)
+    end
+
+    def self.find_files(path)
+      links = []
+      Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        # Skip files not matching known mime types
+        pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
+        if File.directory?(filename) || filename =~ pattern
+          links << "file://#{filename}"
+        end
+      end
+      links
+    end
+
+    def file?
+      File.file? @uri.path
+    end
+
+    def fetch
+      if File.directory? @uri.path
+        # directories are treated like a link collection
+        @content = { :links => self.class.find_files(@uri.path) }
+      else
+        # process this file's contents
+        open(@uri.path) do |file|
+          @content = ContentExtractors.process(file.read, file.content_type)
+          @content[:links] = nil if @content # don't follow links inside files
+        end
+      end
+      @content ||= {}
+    end
+
+  end
+
+
+  #
+  # Remote Document to be retrieved by HTTP
+  #
+  class HttpDocument < Document
+
+    attr_reader :referring_uri
+    attr_reader :status
+    attr_reader :etag
+
+    # url: url of this document, may be relative to the referring doc or host.
+    # referrer: uri of the document we retrieved this link from
+    def initialize(args={})
+      super(args)
+      @referring_uri = args[:referrer]
+    end
+
+    def fetch
+      puts "fetching #{@uri.to_s}" if RDig::config.verbose
+      open(@uri.to_s) do |doc|
+        case doc.status.first.to_i
+        when 200
+          @etag = doc.meta['etag']
+          # puts "etag: #{@etag}"
+          @content = ContentExtractors.process(doc.read, doc.content_type)
+          @status = :success
+        when 404
+          puts "got 404 for #{url}"
+        else
+          puts "don't know what to do with response: #{doc.status.join(' : ')}"
+        end
+      end
+    rescue
+      puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
+    ensure
+      @content ||= {}
+    end
+
+  end
+end
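Document.create above is a small factory: it inspects the URL (and an optional referrer) and hands back the matching document class. A sketch of the dispatch, with placeholder URLs:

require 'uri'

RDig::Document.create('file:///home/bob/documents/')  # FileDocument; fetch lists directory entries as links
RDig::Document.create('http://www.example.com/')      # HttpDocument; fetch uses open-uri
# links found while crawling via http carry their referrer, so relative
# URIs can be resolved later by the fix_relative_uri filter
RDig::Document.create('page.html', URI.parse('http://www.example.com/'))  # HttpDocument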
data/lib/rdig/file.rb
ADDED
@@ -0,0 +1,18 @@
+# Extend class File with a content_type method
+class File
+
+  # mime types and file extensions
+  FILE_EXTENSION_MIME_TYPES = {
+    'doc' => 'application/msword',
+    'html' => 'text/html',
+    'htm' => 'text/html',
+    #'.odt' => 'application/vnd.oasis.opendocument.text',
+    'pdf' => 'application/pdf',
+    'txt' => 'text/plain',
+  }
+
+  def content_type
+    FILE_EXTENSION_MIME_TYPES[File.extname(self.path).downcase.gsub(/^\./,'')] || 'application/octet-stream'
+  end
+
+end
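A quick sketch of the File#content_type extension added above; the file names are only illustrations:

File.open('simple.pdf') { |f| f.content_type }  # => "application/pdf"
File.open('README')     { |f| f.content_type }  # => "application/octet-stream" (unknown extension)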
data/lib/rdig/index.rb
CHANGED
@@ -6,7 +6,7 @@ module RDig
     include MonitorMixin, Ferret::Index, Ferret::Document
 
     def initialize(settings)
-
+      @config = settings
       @index_writer = IndexWriter.new(settings.path,
                                       :create => settings.create,
                                       :analyzer => settings.analyzer)
@@ -14,10 +14,12 @@ module RDig
     end
 
     def add_to_index(document)
-      puts "add to index: #{document.uri.to_s}"
+      puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
       doc = Ferret::Document::Document.new
-
-
+      @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
+
+      doc << Field.new("url", document.uri.to_s,
+                       Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("title", document.title,
                        Field::Store::YES, Field::Index::TOKENIZED)
       doc << Field.new("data", document.body,
data/lib/rdig/url_filters.rb
CHANGED
@@ -82,7 +82,7 @@ module RDig
 
 
     # base class for url inclusion / exclusion filters
-    class
+    class PatternFilter
       # takes an Array of Regexps, or nil to disable the filter
       def initialize(args=nil)
         unless args.nil?
@@ -98,8 +98,8 @@ module RDig
         end
       end
     end
-    class UrlExclusionFilter <
-      # returns nil if any of the patterns matches it's
+    class UrlExclusionFilter < PatternFilter
+      # returns nil if any of the patterns matches it's URI,
       # the document itself otherwise
       def apply(document)
         return document unless @patterns
@@ -109,9 +109,9 @@ module RDig
         return document
       end
     end
-    class UrlInclusionFilter <
-      # returns
-      #
+    class UrlInclusionFilter < PatternFilter
+      # returns the document if any of the patterns matches it's URI,
+      # nil otherwise
       def apply(document)
         return document unless @patterns
         @patterns.each { |p|
@@ -121,21 +121,42 @@ module RDig
       end
     end
 
-
+    # returns nil if any of the patterns matches it's path,
+    # the document itself otherwise. Applied to real files only.
+    class PathExclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return nil if document.uri.path =~ p
+        }
+        return document
+      end
+    end
+    # returns the document if any of the patterns matches it's path,
+    # nil otherwise. Applied to real files only
+    class PathInclusionFilter < PatternFilter
+      def apply(document)
+        return document unless (@patterns && document.file?)
+        @patterns.each { |p|
+          return document if document.uri.path =~ p
+        }
+        return nil
+      end
+    end
 
 
     # checks redirect count of the given document
     # takes it out of the chain if number of redirections exceeds the
     # max_redirects setting
     def UrlFilters.maximum_redirect_filter(document, max_redirects)
-      return nil if document.redirections > max_redirects
+      return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
      return document
     end
 
     # expands both href="/path/xyz.html" and href="affe.html"
     # to full urls
     def UrlFilters.fix_relative_uri(document)
-      return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^
+      #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
       ref = document.referring_uri
       return document unless ref
       uri = document.uri
@@ -150,6 +171,9 @@ module RDig
         uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
       end
       return document
+    rescue
+      p document
+      p document.uri
     end
 
     def UrlFilters.hostname_filter(document, include_hosts)
@@ -167,5 +191,14 @@ module RDig
       return document
     end
 
+    def UrlFilters.scheme_filter_file(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^file$/i)
+      nil
+    end
+    def UrlFilters.scheme_filter_http(document)
+      return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^https?$/i)
+      nil
+    end
+
   end
 end
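The new scheme and path filters above are what keep the :http and :file chains from crossing over. A small behavioral sketch with made-up URLs; note the path filters only apply their patterns to documents whose URI points at an existing file:

# scheme filters pass or drop documents by URI scheme
RDig::UrlFilters.scheme_filter_file(RDig::Document.create('file:///home/bob/a.pdf'))   # => the document
RDig::UrlFilters.scheme_filter_file(RDig::Document.create('http://www.example.com/'))  # => nil
RDig::UrlFilters.scheme_filter_http(RDig::Document.create('http://www.example.com/'))  # => the document

# path filters match against the file system path
f = RDig::UrlFilters::PathExclusionFilter.new([ /private/ ])
f.apply(RDig::Document.create('file:///home/bob/private/report.pdf'))  # => nil (if that file exists)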
data/test/fixtures/pdf/simple.pdf
CHANGED
Binary file
data/test/unit/crawler_fs_test.rb
ADDED
@@ -0,0 +1,32 @@
+require 'test_helper'
+class CrawlerFsTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
+    index_dir = 'tmp/test-index'
+    Dir.mkdir index_dir unless File.directory? index_dir
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
+      cfg.crawler.num_threads = 1
+      cfg.crawler.wait_before_leave = 1
+      cfg.index.path = index_dir
+      cfg.verbose = true
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_crawl
+    crawler = Crawler.new
+    crawler.run
+  end
+
+end
+
+
data/test/unit/file_document_test.rb
ADDED
@@ -0,0 +1,34 @@
+require 'test_helper'
+class FileDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_find_files
+    links = FileDocument.find_files(@fixture_path)
+    assert_equal 3, links.size
+    links = FileDocument.find_files("#{@fixture_path}/html")
+    assert_equal 3, links.size
+  end
+
+  def test_fetch_directory
+    dir = Document.create("file://#{@fixture_path}")
+    dir.fetch
+    assert_equal 3, dir.links.size
+    dir = Document.create("file://#{@fixture_path}/pdf")
+    dir.fetch
+    assert_equal 1, dir.links.size
+  end
+
+  def test_fetch_content
+    file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
+    file.fetch
+    assert file.needs_indexing?
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
+  end
+
+end
+
+
data/test/unit/html_content_extractor_test.rb
CHANGED
@@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @
+    @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
+    @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
     @nbsp = [160].pack('U') # non breaking space
-    @config_backup = RDig.config.content_extraction.html.clone
-  end
-
-  def teardown
-    RDig.config.content_extraction.html = @config_backup
   end
 
   def test_can_do
@@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
   end
 
   def test_custom_content_element
-
-
-
-
-
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('h1', :attrs => { 'class', 'title' })
+    end
+    @config.html.content_tag_selector = lambda do |tagsoup|
+      tagsoup.find('div', :attrs => { 'id', 'content' })
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Sample Title in h1', result[:title]
@@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
 
 
   def test_title_from_dcmeta
-
-
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC meta data', result[:title]
   end
 
   def test_preprocessed_title
-
-
-
-
-
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
+    @config.html.title_tag_selector = lambda do |tagsoup|
+      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
+      # use only a portion of the title tag's contents if it matches our
+      # regexp:
+      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
     end
     result = @extractor.process(html_doc('custom_tag_selectors'))
     assert_equal 'Title from DC', result[:title]
data/test/unit/pdf_content_extractor_test.rb
CHANGED
@@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   include TestHelper
 
   def setup
-    @ce = ContentExtractors::PdfContentExtractor.new
+    @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
   end
 
   def test_can_do
@@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase
   private
   def check_content(result)
     assert_not_nil result
-
+    assert_equal 'PDF Test', result[:title]
     assert_nil result[:links]
     assert_not_nil result[:content]
-    assert_equal 'This
+    assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
   end
 
 end
data/test/unit/url_filters_test.rb
CHANGED
@@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   # test default chain config
   def test_default_filterchain
-    chain = UrlFilters::FilterChain.new(RDig.filter_chain)
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
+    assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
+    assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
   end
 
   # check lookup of chain parameters from config
@@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase
     ]
     chain = UrlFilters::FilterChain.new(cfg)
 
-    assert_nil chain.apply(Document.
-    assert_not_nil chain.apply(Document.
-    assert_nil chain.apply(Document.
+    assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
+    assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
   end
 
   def test_urlpattern_filter
     f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
     f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
-    assert_not_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_nil f.apply(Document.create("http://test.host/affe.html"))
+    assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
     f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
     f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
-    assert_nil f.apply(Document.
-    assert_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
-    assert_not_nil f.apply(Document.
+    assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
+    assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
+    assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
   end
 
   def test_hostname_filter
     include_hosts = [ 'test.host', 'localhost' ]
-    assert_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
-    assert_not_nil UrlFilters.hostname_filter(Document.
+    assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
+    assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
   end
 
   def test_fix_relative_uri
-    doc = Document.
+    doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(Document.
+                 UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-                 UrlFilters.fix_relative_uri(Document.
+                 UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(Document.
+                 UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(Document.
+                 UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+                 UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+                 UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-                 UrlFilters.fix_relative_uri(Document.
-    doc = Document.
+                 UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+    doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-                 UrlFilters.fix_relative_uri(Document.
+                 UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
   end
 end
 
metadata
CHANGED
@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
+rubygems_version: 0.8.11.15
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.
-date: 2006-04-
+  version: 0.3.0
+date: 2006-04-26 00:00:00 +02:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
 platform: ruby
 signing_key:
 cert_chain:
+post_install_message:
 authors:
 - Jens Kraemer
 files:
@@ -32,13 +33,14 @@ files:
 - lib/rdig
 - lib/htmlentities
 - lib/rdig.rb
-- lib/rdig/http_client.rb
 - lib/rdig/crawler.rb
 - lib/rdig/search.rb
 - lib/rdig/highlight.rb
 - lib/rdig/index.rb
 - lib/rdig/url_filters.rb
 - lib/rdig/content_extractors.rb
+- lib/rdig/documents.rb
+- lib/rdig/file.rb
 - lib/htmlentities/CHANGES
 - lib/htmlentities/COPYING
 - lib/htmlentities/README
@@ -51,6 +53,8 @@ files:
 - test/unit/html_content_extractor_test.rb
 - test/unit/pdf_content_extractor_test.rb
 - test/unit/word_content_extractor_test.rb
+- test/unit/file_document_test.rb
+- test/unit/crawler_fs_test.rb
 - test/fixtures/html
 - test/fixtures/pdf
 - test/fixtures/word
data/lib/rdig/http_client.rb
DELETED
@@ -1,22 +0,0 @@
-module RDig
-
-  module HttpClient
-    def do_get(uri, user_agent='RDig crawler')
-      # Set up the appropriate http headers
-      headers = { "User-Agent" => user_agent }
-      result = {}
-
-      begin
-        Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
-          final_uri = uri.path
-          final_uri += ('?' + uri.query) if uri.query
-          return http.get(final_uri, headers)
-        }
-      rescue => error
-        puts error
-      end
-    end
-  end
-
-end
-