rdig 0.2.1 → 0.3.0

data/CHANGES CHANGED
@@ -1,3 +1,17 @@
1
+ 0.3.0
2
+ - file system crawling
3
+ - optional url rewriting before indexing, e.g. for linking to results
4
+ via http and building the index directly from the file system
5
+ - PDF title extraction with pdfinfo
6
+ - removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
7
+ - made content extractors more flexible - instances now use a given
8
+ configuration instead of the global one. This allows the
9
+ WordContentExtractor to use an HtmlContentExtractor with its own
10
+ configuration that is independent of the global config.
11
+
12
+ 0.2.1
13
+ - Bugfix release
14
+
1
15
  0.2.0
2
16
  - add pdf and Word content extraction capabilities using the tools
3
17
  from the xpdf-utils and wv packages
@@ -1,25 +1,36 @@
1
1
  RDig.configuration do |cfg|
2
2
 
3
3
  ##################################################################
4
- # options you should really set
4
+ # options you really should set
5
5
 
6
6
  # provide one or more URLs for the crawler to start from
7
7
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
8
8
 
9
+ # use something like this for crawling a file system:
10
+ # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
11
+ # beware, mixing file and http crawling is not supported and may lead to
12
+ # unpredictable results.
13
+
9
14
  # limit the crawl to these hosts. The crawler will never
10
15
  # follow any links pointing to hosts other than those given here.
16
+ # ignored for file system crawling
11
17
  cfg.crawler.include_hosts = [ 'www.example.com' ]
12
18
 
13
19
  # this is the path where the index will be stored
14
20
  # caution, existing contents of this directory will be deleted!
15
- cfg.ferret.path = '/path/to/index'
21
+ cfg.index.path = '/path/to/index'
16
22
 
17
23
  ##################################################################
18
24
  # options you might want to set, the given values are the defaults
25
+
26
+ # set to true to get stack traces on errors
27
+ # cfg.verbose = false
19
28
 
20
29
  # content extraction options
21
30
 
22
- # provide a method that selects the tag containing the title of a document
31
+ # provide a method that returns the title of an html document
32
+ # this method may either return a tag to extract the title from,
33
+ # or a ready-to-index string.
23
34
  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
24
35
 
25
36
  # provide a method that selects the tag containing the page content you
@@ -29,8 +40,12 @@ RDig.configuration do |cfg|
29
40
 
30
41
  # crawler options
31
42
 
32
- # nil (index all documents) or an array of Regexps
33
- # matching URLs you want to index.
43
+ # Notice: for file system crawling the include/exclude_documents patterns are
44
+ # applied to the full path of _files_ only (like /home/bob/test.pdf),
45
+ # for http to full URIs (like http://example.com/index.html).
46
+
47
+ # nil (include all documents) or an array of Regexps
48
+ # matching the URLs you want to index.
34
49
  # cfg.crawler.include_documents = nil
35
50
 
36
51
  # nil (no documents excluded) or an array of Regexps
@@ -40,14 +55,35 @@ RDig.configuration do |cfg|
40
55
  # included by the inclusion patterns.
41
56
  # cfg.crawler.exclude_documents = nil
42
57
 
43
- # number of http fetching threads to use
58
+ # number of document fetching threads to use. Should be raised only if
59
+ # your CPU has idle time when indexing.
44
60
  # cfg.crawler.num_threads = 2
61
+ # suggested setting for file system crawling:
62
+ # cfg.crawler.num_threads = 1
45
63
 
46
64
  # maximum number of http redirections to follow
47
65
  # cfg.crawler.max_redirects = 5
48
66
 
49
67
  # number of seconds to wait with an empty url queue before
50
- # finishing the crawl. Set to a higher number for slow sites
68
+ # finishing the crawl. Set to a higher number when experiencing incomplete
69
+ # crawls on slow sites. Don't set to 0, even when crawling a local fs.
51
70
  # cfg.crawler.wait_before_leave = 10
71
+
72
+ # indexer options
73
+
74
+ # create a new index on each run. Will append to the index if false. Set to
75
+ # false when building a single index from multiple runs, e.g. one run across
76
+ # a web site and another across a tree in a local file system
77
+ # cfg.index.create = true
78
+
79
+ # rewrite document uris before indexing them. This is useful if you're
80
+ # indexing on disk, but the documents should be accessible via http, e.g. from
81
+ # a web based search application. By default, no rewriting takes place.
82
+ # example:
83
+ # cfg.index.rewrite_uri = lambda { |uri|
84
+ # uri.path.gsub!(/^\/base\//, '/virtual_dir/')
85
+ # uri.scheme = 'http'
86
+ # uri.host = 'www.mydomain.com'
87
+ # }
52
88
 
53
89
  end
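To make the rewrite_uri hook above concrete, here is a small standalone sketch of what such a lambda does to a document URI before indexing. The base path, virtual directory and host name are made-up example values mirroring the commented example, not RDig defaults:

  require 'uri'

  # hypothetical rewrite rule, as it could be assigned to cfg.index.rewrite_uri
  rewrite_uri = lambda { |uri|
    uri.path.gsub!(/^\/base\//, '/virtual_dir/')
    uri.scheme = 'http'
    uri.host   = 'www.mydomain.com'
  }

  uri = URI.parse('file:///base/docs/manual.html')
  rewrite_uri.call(uri)
  puts uri   # => http://www.mydomain.com/virtual_dir/docs/manual.html

The indexer calls this lambda on each document URI right before the document is stored, so the url field in the index already contains the rewritten, web-reachable address.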
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.2.1'
27
+ RDIGVERSION = '0.3.0'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -38,28 +38,28 @@ require 'set'
38
38
  require 'net/http'
39
39
  require 'getoptlong'
40
40
  require 'tempfile'
41
- # mkmf gives us the handy find_executable method used to check for helper
42
- # programs:
43
- require 'mkmf'
41
+ require 'open-uri'
44
42
 
45
43
  begin
46
- require 'rubyful_soup'
47
44
  require 'ferret'
45
+ require 'rubyful_soup'
48
46
  rescue LoadError
49
47
  require 'rubygems'
50
- require 'rubyful_soup'
51
48
  require 'ferret'
49
+ require 'rubyful_soup'
52
50
  end
53
51
 
54
52
  require 'htmlentities/htmlentities'
55
-
56
- require 'rdig/http_client'
53
+
57
54
  require 'rdig/content_extractors'
58
55
  require 'rdig/url_filters'
59
56
  require 'rdig/search'
60
57
  require 'rdig/index'
58
+ require 'rdig/file'
59
+ require 'rdig/documents'
61
60
  require 'rdig/crawler'
62
61
 
62
+
63
63
  $KCODE = 'u'
64
64
  require 'jcode'
65
65
 
@@ -68,17 +68,30 @@ module RDig
68
68
 
69
69
  class << self
70
70
 
71
- # the filter chain each URL has to run through before being crawled.
71
+ # the filter chains are for limiting the set of indexed documents.
72
+ # there are two chain types - one for http, and one for file system
73
+ # crawling.
74
+ # a document has to survive all filters in the chain to get indexed.
72
75
  def filter_chain
73
- @filter_chain ||= [
74
- { :maximum_redirect_filter => :max_redirects },
75
- :fix_relative_uri,
76
- :normalize_uri,
77
- { :hostname_filter => :include_hosts },
78
- { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
79
- { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
80
- RDig::UrlFilters::VisitedUrlFilter
81
- ]
76
+ @filter_chain ||= {
77
+ # filter chain for http crawling
78
+ :http => [
79
+ :scheme_filter_http,
80
+ :fix_relative_uri,
81
+ :normalize_uri,
82
+ { :hostname_filter => :include_hosts },
83
+ { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
84
+ { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
85
+ RDig::UrlFilters::VisitedUrlFilter
86
+ ],
87
+ # filter chain for file system crawling
88
+ :file => [
89
+ :scheme_filter_file,
90
+ { RDig::UrlFilters::PathInclusionFilter => :include_documents },
91
+ { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
92
+ ]
93
+ }
94
+
82
95
  end
83
96
 
84
97
  def application
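The FilterChain class itself is not part of this diff, so the following is only a rough sketch of how a chain like the one above could be walked; the method name apply_chain and its arguments are invented for illustration, not RDig API:

  # a document survives the chain only if every filter returns it (non-nil)
  def apply_chain(document, chain, crawler_cfg)
    chain.each do |entry|
      document =
        case entry
        when Symbol
          # plain module function, e.g. UrlFilters.scheme_filter_http(doc)
          RDig::UrlFilters.send(entry, document)
        when Hash
          # filter parameterized with a value from the crawler configuration,
          # e.g. { :hostname_filter => :include_hosts }
          filter, cfg_key = entry.to_a.first
          arg = crawler_cfg.send(cfg_key)
          if filter.is_a?(Class)
            filter.new(arg).apply(document)
          else
            RDig::UrlFilters.send(filter, document, arg)
          end
        when Class
          # stateful filter without extra configuration, e.g. VisitedUrlFilter
          entry.new.apply(document)
        end
      return nil if document.nil?
    end
    document
  end

The real implementation may differ in detail; the point is that each entry is either the name of a module function in UrlFilters, a filter driven by a crawler config value, or a bare filter class.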
@@ -86,7 +99,7 @@ module RDig
86
99
  end
87
100
 
88
101
  def searcher
89
- @searcher ||= Search::Searcher.new(config.ferret)
102
+ @searcher ||= Search::Searcher.new(config.index)
90
103
  end
91
104
 
92
105
  # RDig configuration
@@ -124,7 +137,7 @@ module RDig
124
137
  }
125
138
  )
126
139
  ),
127
- :ferret => OpenStruct.new(
140
+ :index => OpenStruct.new(
128
141
  :path => "index/",
129
142
  :create => true,
130
143
  :handle_parse_errors => true,
@@ -224,6 +237,8 @@ module RDig
224
237
 
225
238
  end
226
239
 
240
+ puts "using Ferret #{Ferret::VERSION}"
241
+
227
242
  if options.query
228
243
  # query the index
229
244
  puts "executing query >#{options.query}<"
@@ -54,7 +54,9 @@ module RDig
54
54
 
55
55
  def self.extractors; @@extractors ||= [] end
56
56
  def self.extractor_instances
57
- @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
57
+ @@extractor_instances ||= extractors.map { |ex_class|
58
+ ex_class.new(RDig.configuration.content_extraction)
59
+ }
58
60
  end
59
61
 
60
62
  def self.process(content, content_type)
@@ -65,6 +67,10 @@ module RDig
65
67
  nil
66
68
  end
67
69
 
70
+ def initialize(config)
71
+ @config = config
72
+ end
73
+
68
74
  def can_do(content_type)
69
75
  content_type =~ @pattern
70
76
  end
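As a small usage illustration, assuming RDig and its dependencies are installed and that the HtmlContentExtractor behaves as exercised by the unit tests further down: process hands the content to the first registered extractor whose can_do matches the content type and returns a hash of extracted fields, or nil if no extractor matches.

  require 'rdig'

  html = '<html><head><title>Hello</title></head><body><p>World</p></body></html>'
  result = RDig::ContentExtractors.process(html, 'text/html')
  result[:title]    # => "Hello"
  result[:content]  # => "World"
  RDig::ContentExtractors.process('plain text', 'text/x-unknown')   # => nil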
@@ -91,60 +97,88 @@ module RDig
91
97
  file.delete
92
98
  end
93
99
 
94
- def available
95
- if @available.nil?
96
- @available = !find_executable(@executable).nil?
97
- end
98
- @available
99
- end
100
-
100
+ # @available must be set in the initializer of each extractor according to
101
+ # the presence of the external programs it needs, so can_do works correctly
101
102
  def can_do(content_type)
102
- available and super(content_type)
103
+ @available and super(content_type)
103
104
  end
104
105
  end
105
106
 
106
107
  # Extract text from pdf content.
107
108
  #
108
- # Requires the pdftotext utility from the xpdf-utils package
109
+ # Requires the pdftotext and pdfinfo utilities from the
110
+ # xpdf-utils package
109
111
  # (on debian and friends do 'apt-get install xpdf-utils')
110
112
  #
111
- # TODO: use pdfinfo to get title from document
112
113
  class PdfContentExtractor < ContentExtractor
113
114
  include ExternalAppHelper
114
115
 
115
- def initialize
116
- @executable = 'pdftotext'
116
+ def initialize(config)
117
+ super(config)
117
118
  @pattern = /^application\/pdf/
119
+ @pdftotext = 'pdftotext'
120
+ @pdfinfo = 'pdfinfo'
121
+ @available = true
122
+ [ @pdftotext, @pdfinfo].each { |program|
123
+ unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
124
+ @available = false
125
+ break
126
+ end
127
+ }
118
128
  end
119
-
129
+
130
+ def process(content)
131
+ result = {}
132
+ as_file(content) do |file|
133
+ result[:content] = get_content(file.path).strip
134
+ result[:title] = get_title(file.path)
135
+ end
136
+ result
137
+ end
138
+
120
139
  def get_content(path_to_tempfile)
121
- %x{#{@executable} '#{path_to_tempfile}' -}
140
+ %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
141
+ end
142
+
143
+ # extracts the title from pdf meta data
144
+ # needs pdfinfo
145
+ # returns the title or nil if no title was found
146
+ def get_title(path_to_tempfile)
147
+ %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
148
+ rescue
122
149
  end
123
150
  end
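A hypothetical usage example, assuming the pdftotext and pdfinfo programs are installed and '/tmp/sample.pdf' stands in for a real PDF file:

  require 'rdig'

  extractor = RDig::ContentExtractors::PdfContentExtractor.new(
    RDig.configuration.content_extraction)
  if extractor.can_do('application/pdf')
    result = extractor.process(File.read('/tmp/sample.pdf'))
    result[:title]    # title from the PDF meta data via pdfinfo, or nil
    result[:content]  # plain text extracted by pdftotext
  end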
124
151
 
125
152
  # Extract text from word documents
126
153
  #
127
- # Requires the antiword utility
128
- # (on debian and friends do 'apt-get install antiword')
154
+ # Requires the wvHtml utility
155
+ # (on debian and friends do 'apt-get install wv')
129
156
  class WordContentExtractor < ContentExtractor
130
157
  include ExternalAppHelper
131
158
 
132
- def initialize
133
- @executable = 'wvHtml'
159
+ def initialize(config)
160
+ super(config)
161
+ @wvhtml = 'wvHtml'
134
162
  @pattern = /^application\/msword/
135
- @html_extractor = HtmlContentExtractor.new
163
+ # html extractor for parsing wvHtml output
164
+ @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
165
+ :html => OpenStruct.new(
166
+ :content_tag_selector => lambda { |tagsoup|
167
+ tagsoup.html.body
168
+ },
169
+ :title_tag_selector => lambda { |tagsoup|
170
+ tagsoup.html.head.title
171
+ }
172
+ )))
173
+
174
+ # TODO: better: check if $?.exitstatus == 127 (not found)
175
+ @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
136
176
  end
137
177
 
138
178
  def process(content)
139
179
  result = {}
140
- as_file(content) do |infile|
141
- outfile = Tempfile.new('rdig')
142
- outfile.close
143
- %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
144
- File.open(outfile.path) do |html|
145
- result = @html_extractor.process(html.read)
146
- end
147
- outfile.delete
180
+ as_file(content) do |file|
181
+ result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
148
182
  end
149
183
  return result || {}
150
184
  end
@@ -154,7 +188,8 @@ module RDig
154
188
  # extracts title, content and links from html documents
155
189
  class HtmlContentExtractor < ContentExtractor
156
190
 
157
- def initialize
191
+ def initialize(config)
192
+ super(config)
158
193
  @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
159
194
  end
160
195
 
@@ -181,9 +216,10 @@ module RDig
181
216
  # children.
182
217
  def extract_content(tag_soup)
183
218
  content = ''
184
- content_element(tag_soup).children { |child|
219
+ ce = content_element(tag_soup)
220
+ ce.children { |child|
185
221
  extract_text(child, content)
186
- }
222
+ } unless ce.nil?
187
223
  return content.strip
188
224
  end
189
225
 
@@ -197,18 +233,20 @@ module RDig
197
233
 
198
234
  # Extracts the title from the given html tree
199
235
  def extract_title(tagsoup)
200
- title = ''
201
236
  the_title_tag = title_tag(tagsoup)
202
237
  if the_title_tag.is_a? String
203
238
  the_title_tag
204
239
  else
205
- extract_text(the_title_tag).strip if the_title_tag
240
+ title = ''
241
+ extract_text(the_title_tag, title)
242
+ title.strip
206
243
  end
207
244
  end
208
245
 
209
246
  # Recursively extracts all text contained in the given element,
210
247
  # and appends it to content.
211
248
  def extract_text(element, content='')
249
+ return nil if element.nil?
212
250
  if element.is_a? NavigableString
213
251
  value = strip_comments(element)
214
252
  value.strip!
@@ -234,8 +272,8 @@ module RDig
234
272
  # This may return a string, e.g. an attribute value selected from a meta
235
273
  # tag, too.
236
274
  def title_tag(tagsoup)
237
- if RDig.config.content_extraction.html.title_tag_selector
238
- RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
275
+ if @config.html.title_tag_selector
276
+ @config.html.title_tag_selector.call(tagsoup)
239
277
  else
240
278
  tagsoup.html.head.title
241
279
  end
@@ -243,8 +281,8 @@ module RDig
243
281
 
244
282
  # Retrieve the root element to extract document content from
245
283
  def content_element(tagsoup)
246
- if RDig.config.content_extraction.html.content_tag_selector
247
- RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
284
+ if @config.html.content_tag_selector
285
+ @config.html.content_tag_selector.call(tagsoup)
248
286
  else
249
287
  tagsoup.html.body
250
288
  end
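This per-instance lookup is what allows an extractor to run with its own selectors, independent of the global configuration. A short sketch following the pattern used in the unit tests below; the file name and selectors are example values:

  require 'rdig'
  require 'ostruct'

  config = OpenStruct.new(:html => OpenStruct.new(
    :title_tag_selector   => lambda { |tagsoup| tagsoup.html.head.title },
    :content_tag_selector => lambda { |tagsoup|
      tagsoup.find('div', :attrs => { 'id' => 'content' })
    }))

  extractor = RDig::ContentExtractors::HtmlContentExtractor.new(config)
  result = extractor.process(File.read('page.html'))
  result[:title]    # text of the <title> tag
  result[:content]  # text inside <div id="content"> instead of the whole body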
@@ -9,30 +9,28 @@ module RDig
9
9
  end
10
10
 
11
11
  def run
12
- @indexer = Index::Indexer.new(RDig.config.ferret)
13
- filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
12
+ raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
13
+ @indexer = Index::Indexer.new(RDig.config.index)
14
+
15
+ # check whether we are indexing on-disk or via http
16
+ url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
17
+ chain_config = RDig.filter_chain[url_type]
18
+
19
+ filterchain = UrlFilters::FilterChain.new(chain_config)
14
20
  RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
15
21
 
16
22
  num_threads = RDig.config.crawler.num_threads
17
23
  group = ThreadsWait.new
18
24
  num_threads.times { |i|
19
25
  group.join_nowait Thread.new("fetcher #{i}") {
20
- filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
26
+ filterchain = UrlFilters::FilterChain.new(chain_config)
21
27
  while (doc = @documents.pop) != :exit
22
28
  process_document doc, filterchain
23
29
  end
24
30
  }
25
31
  }
26
32
 
27
- # dilemma: suppose we have 1 start url and two threads t1 and t2:
28
- # t1 pops the start url from the queue which now is empty
29
- # as the queue is empty now, t2 blocks until t1 adds the links
30
- # retrieved from his document.
31
- #
32
- # But we need the 'queue empty' condition as a sign for us to stop
33
- # waiting for new entries, too.
34
-
35
- # check every now and then for an empty queue
33
+ # check for an empty queue every now and then
36
34
  sleep_interval = RDig.config.crawler.wait_before_leave
37
35
  begin
38
36
  sleep sleep_interval
@@ -54,22 +52,10 @@ module RDig
54
52
  } unless doc.content[:links].nil?
55
53
 
56
54
  return unless @etag_filter.apply(doc)
57
- case doc.status
58
- when :success
59
- if doc.content
60
- if doc.content[:links]
61
- doc.content[:links].each { |url| add_url(url, filterchain, doc) }
62
- end
63
- @indexer << doc
64
- #else
65
- #puts "success but no content: #{doc.uri.to_s}"
66
- end
67
- when :redirect
68
- # links contains the url we were redirected to
69
- doc.content[:links].each { |url| add_url(url, filterchain, doc) }
70
- end
55
+ @indexer << doc if doc.needs_indexing?
71
56
  rescue
72
57
  puts "error processing document #{doc.uri.to_s}: #{$!}"
58
+ puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
73
59
  end
74
60
 
75
61
 
@@ -78,82 +64,23 @@ module RDig
78
64
  # processing
79
65
  def add_url(url, filterchain, referring_document = nil)
80
66
  return if url.nil? || url.empty?
81
- if referring_document
82
- doc = Document.new(url, referring_document.uri)
83
- # keep redirect count
84
- if referring_document.status == :redirect
85
- doc.redirections = referring_document.redirections + 1
86
- end
67
+ if referring_document and referring_document.uri.scheme =~ /^https?/i
68
+ doc = Document.create(url, referring_document.uri)
87
69
  else
88
- doc = Document.new(url)
70
+ doc = Document.create(url)
89
71
  end
90
72
 
91
73
  doc = filterchain.apply(doc)
92
74
 
93
75
  if doc
94
- puts "added url #{url}"
95
- #else
96
- #puts "skipping url #{url}"
76
+ @documents << doc
77
+ puts "added url #{url}" if RDig::config.verbose
97
78
  end
98
- @documents << doc if doc
99
79
  end
100
80
 
101
81
  end
102
82
 
103
83
 
104
- class Document
105
- include HttpClient
106
-
107
- attr_reader :content
108
- attr_reader :content_type
109
- attr_reader :uri
110
- attr_reader :referring_uri
111
- attr_reader :status
112
- attr_reader :etag
113
- attr_accessor :redirections
114
-
115
- # url: url of this document, may be relative to the referring doc or host.
116
- # referrer: uri of the document we retrieved this link from
117
- def initialize(url, referrer = nil)
118
- @redirections = 0
119
- begin
120
- @uri = URI.parse(url)
121
- rescue URI::InvalidURIError
122
- raise "Cannot create document using invalid URL: #{url}"
123
- end
124
- @referring_uri = referrer
125
- end
126
-
127
- def has_content?
128
- !self.content.nil?
129
- end
130
-
131
- def title; @content[:title] end
132
- def body; @content[:content] end
133
- def url; @uri.to_s end
134
-
135
- def fetch
136
- puts "fetching #{@uri.to_s}"
137
- response = do_get(@uri)
138
- case response
139
- when Net::HTTPSuccess
140
- @content_type = response['content-type']
141
- @raw_body = response.body
142
- @etag = response['etag']
143
- # todo externalize this (another chain ?)
144
- @content = ContentExtractors.process(@raw_body, @content_type)
145
- @status = :success
146
- when Net::HTTPRedirection
147
- @status = :redirect
148
- @content = { :links => [ response['location'] ] }
149
- else
150
- puts "don't know what to do with response: #{response}"
151
- end
152
-
153
- end
154
-
155
- end
156
-
157
84
  # checks fetched documents' E-Tag headers against the list of E-Tags
158
85
  # of the documents already indexed.
159
86
  # This is supposed to help against double-indexing documents which can
@@ -169,7 +96,7 @@ module RDig
169
96
  end
170
97
 
171
98
  def apply(document)
172
- return document unless document.etag
99
+ return document unless (document.respond_to?(:etag) && document.etag)
173
100
  synchronize do
174
101
  @etags.add?(document.etag) ? document : nil
175
102
  end
@@ -0,0 +1,133 @@
1
+ module RDig
2
+
3
+ #
4
+ # Document base class
5
+ #
6
+ class Document
7
+
8
+ attr_reader :uri
9
+ attr_reader :content
10
+ attr_reader :content_type
11
+
12
+ def self.create(url, referrer_uri = nil)
13
+ # a referrer is a clear enough hint to create an HttpDocument
14
+ if referrer_uri && referrer_uri.scheme =~ /^https?$/i
15
+ return HttpDocument.new(:url => url, :referrer => referrer_uri)
16
+ end
17
+
18
+ case url
19
+ when /^https?:\/\//i
20
+ HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
21
+ when /^file:\/\//i
22
+ # files don't have referrers - the check for nil prevents us from being
23
+ # tricked into indexing local files by file:// links in the web site
24
+ # we index.
25
+ FileDocument.new(:url => url) if referrer_uri.nil?
26
+ end
27
+ end
28
+
29
+ # url: url of this document, may be relative to the referring doc or host.
30
+ # referrer: uri of the document we retrieved this link from
31
+ def initialize(args)
32
+ begin
33
+ @uri = URI.parse(args[:url])
34
+ rescue URI::InvalidURIError
35
+ raise "Cannot create document using invalid URL: #{url}"
36
+ end
37
+ end
38
+
39
+ def title; @content[:title] end
40
+ def body; @content[:content] end
41
+ def links; @content[:links] end
42
+
43
+ def needs_indexing?
44
+ has_content? && (title || body)
45
+ end
46
+
47
+ def has_content?
48
+ !self.content.nil?
49
+ end
50
+
51
+ end
52
+
53
+
54
+ #
55
+ # Document in a File system
56
+ #
57
+ class FileDocument < Document
58
+ def initialize(args={})
59
+ super(args)
60
+ end
61
+
62
+ def self.find_files(path)
63
+ links = []
64
+ Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
65
+ # Skip files not matching known mime types
66
+ pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
67
+ if File.directory?(filename) || filename =~ pattern
68
+ links << "file://#{filename}"
69
+ end
70
+ end
71
+ links
72
+ end
73
+
74
+ def file?
75
+ File.file? @uri.path
76
+ end
77
+
78
+ def fetch
79
+ if File.directory? @uri.path
80
+ # directories are treated like a link collection
81
+ @content = { :links => self.class.find_files(@uri.path) }
82
+ else
83
+ # process this file's contents
84
+ open(@uri.path) do |file|
85
+ @content = ContentExtractors.process(file.read, file.content_type)
86
+ @content[:links] = nil if @content # don't follow links inside files
87
+ end
88
+ end
89
+ @content ||= {}
90
+ end
91
+
92
+ end
93
+
94
+
95
+ #
96
+ # Remote Document to be retrieved by HTTP
97
+ #
98
+ class HttpDocument < Document
99
+
100
+ attr_reader :referring_uri
101
+ attr_reader :status
102
+ attr_reader :etag
103
+
104
+ # url: url of this document, may be relative to the referring doc or host.
105
+ # referrer: uri of the document we retrieved this link from
106
+ def initialize(args={})
107
+ super(args)
108
+ @referring_uri = args[:referrer]
109
+ end
110
+
111
+ def fetch
112
+ puts "fetching #{@uri.to_s}" if RDig::config.verbose
113
+ open(@uri.to_s) do |doc|
114
+ case doc.status.first.to_i
115
+ when 200
116
+ @etag = doc.meta['etag']
117
+ # puts "etag: #{@etag}"
118
+ @content = ContentExtractors.process(doc.read, doc.content_type)
119
+ @status = :success
120
+ when 404
121
+ puts "got 404 for #{url}"
122
+ else
123
+ puts "don't know what to do with response: #{doc.status.join(' : ')}"
124
+ end
125
+ end
126
+ rescue
127
+ puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
128
+ ensure
129
+ @content ||= {}
130
+ end
131
+
132
+ end
133
+ end
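A brief hypothetical usage sketch of the new document factory; the directory and URL below are placeholders:

  require 'rdig'

  dir = RDig::Document.create('file:///home/bob/documents/')
  dir.fetch
  dir.links    # file:// links to sub-directories and indexable files

  page = RDig::Document.create('http://www.example.com/')
  page.fetch
  page.title if page.needs_indexing?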
@@ -0,0 +1,18 @@
1
+ # Extend class File with a content_type method
2
+ class File
3
+
4
+ # mime types and file extensions
5
+ FILE_EXTENSION_MIME_TYPES = {
6
+ 'doc' => 'application/msword',
7
+ 'html' => 'text/html',
8
+ 'htm' => 'text/html',
9
+ #'.odt' => 'application/vnd.oasis.opendocument.text',
10
+ 'pdf' => 'application/pdf',
11
+ 'txt' => 'text/plain',
12
+ }
13
+
14
+ def content_type
15
+ FILE_EXTENSION_MIME_TYPES[File.extname(self.path).downcase.gsub(/^\./,'')] || 'application/octet-stream'
16
+ end
17
+
18
+ end
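For illustration, with the extension mapping above in place (the file names are placeholders and have to exist for File.open to succeed):

  require 'rdig/file'

  File.open('/home/bob/documents/report.pdf') { |f| f.content_type }  # => "application/pdf"
  File.open('/home/bob/documents/README')     { |f| f.content_type }  # => "application/octet-stream"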
@@ -6,7 +6,7 @@ module RDig
6
6
  include MonitorMixin, Ferret::Index, Ferret::Document
7
7
 
8
8
  def initialize(settings)
9
- #@ferret_config = settings
9
+ @config = settings
10
10
  @index_writer = IndexWriter.new(settings.path,
11
11
  :create => settings.create,
12
12
  :analyzer => settings.analyzer)
@@ -14,10 +14,12 @@ module RDig
14
14
  end
15
15
 
16
16
  def add_to_index(document)
17
- puts "add to index: #{document.uri.to_s}"
17
+ puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
18
18
  doc = Ferret::Document::Document.new
19
- doc << Field.new("url", document.url,
20
- Field::Store::YES, Field::Index::UNTOKENIZED)
19
+ @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
20
+
21
+ doc << Field.new("url", document.uri.to_s,
22
+ Field::Store::YES, Field::Index::TOKENIZED)
21
23
  doc << Field.new("title", document.title,
22
24
  Field::Store::YES, Field::Index::TOKENIZED)
23
25
  doc << Field.new("data", document.body,
@@ -82,7 +82,7 @@ module RDig
82
82
 
83
83
 
84
84
  # base class for url inclusion / exclusion filters
85
- class UrlPatternFilter
85
+ class PatternFilter
86
86
  # takes an Array of Regexps, or nil to disable the filter
87
87
  def initialize(args=nil)
88
88
  unless args.nil?
@@ -98,8 +98,8 @@ module RDig
98
98
  end
99
99
  end
100
100
  end
101
- class UrlExclusionFilter < UrlPatternFilter
102
- # returns nil if any of the patterns matches it's URL,
101
+ class UrlExclusionFilter < PatternFilter
102
+ # returns nil if any of the patterns matches its URI,
103
103
  # the document itself otherwise
104
104
  def apply(document)
105
105
  return document unless @patterns
@@ -109,9 +109,9 @@ module RDig
109
109
  return document
110
110
  end
111
111
  end
112
- class UrlInclusionFilter < UrlPatternFilter
113
- # returns nil if any of the patterns matches it's URL,
114
- # the document itself otherwise
112
+ class UrlInclusionFilter < PatternFilter
113
+ # returns the document if any of the patterns matches its URI,
114
+ # nil otherwise
115
115
  def apply(document)
116
116
  return document unless @patterns
117
117
  @patterns.each { |p|
@@ -121,21 +121,42 @@ module RDig
121
121
  end
122
122
  end
123
123
 
124
-
124
+ # returns nil if any of the patterns matches its path,
125
+ # the document itself otherwise. Applied to real files only.
126
+ class PathExclusionFilter < PatternFilter
127
+ def apply(document)
128
+ return document unless (@patterns && document.file?)
129
+ @patterns.each { |p|
130
+ return nil if document.uri.path =~ p
131
+ }
132
+ return document
133
+ end
134
+ end
135
+ # returns the document if any of the patterns matches its path,
136
+ # nil otherwise. Applied to real files only
137
+ class PathInclusionFilter < PatternFilter
138
+ def apply(document)
139
+ return document unless (@patterns && document.file?)
140
+ @patterns.each { |p|
141
+ return document if document.uri.path =~ p
142
+ }
143
+ return nil
144
+ end
145
+ end
125
146
 
126
147
 
127
148
  # checks redirect count of the given document
128
149
  # takes it out of the chain if number of redirections exceeds the
129
150
  # max_redirects setting
130
151
  def UrlFilters.maximum_redirect_filter(document, max_redirects)
131
- return nil if document.redirections > max_redirects
152
+ return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
132
153
  return document
133
154
  end
134
155
 
135
156
  # expands both href="/path/xyz.html" and href="affe.html"
136
157
  # to full urls
137
158
  def UrlFilters.fix_relative_uri(document)
138
- return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^http/i
159
+ #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
139
160
  ref = document.referring_uri
140
161
  return document unless ref
141
162
  uri = document.uri
@@ -150,6 +171,9 @@ module RDig
150
171
  uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
151
172
  end
152
173
  return document
174
+ rescue
175
+ p document
176
+ p document.uri
153
177
  end
154
178
 
155
179
  def UrlFilters.hostname_filter(document, include_hosts)
@@ -167,5 +191,14 @@ module RDig
167
191
  return document
168
192
  end
169
193
 
194
+ def UrlFilters.scheme_filter_file(document)
195
+ return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^file$/i)
196
+ nil
197
+ end
198
+ def UrlFilters.scheme_filter_http(document)
199
+ return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^https?$/i)
200
+ nil
201
+ end
202
+
170
203
  end
171
204
  end
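A short illustration of how the pattern filters behave; URLs and paths are placeholders, compare the unit tests further down:

  require 'rdig'
  include RDig

  f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
  f.apply(Document.create('http://test.host/page.html'))   # => the document
  f.apply(Document.create('http://test.host/image.gif'))   # => nil

  # the Path* variants only look at the local path of real files
  pf = UrlFilters::PathExclusionFilter.new([ /\/tmp\// ])
  doc = Document.create('file:///home/bob/tmp/scratch.txt')
  pf.apply(doc)   # => nil if that file exists, since its path matches /\/tmp\//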
Binary file
@@ -0,0 +1,32 @@
1
+ require 'test_helper'
2
+ class CrawlerFsTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
7
+ index_dir = 'tmp/test-index'
8
+ Dir.mkdir index_dir unless File.directory? index_dir
9
+ RDig.configuration do |cfg|
10
+ @old_crawler_cfg = cfg.crawler.clone
11
+ cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
12
+ cfg.crawler.num_threads = 1
13
+ cfg.crawler.wait_before_leave = 1
14
+ cfg.index.path = index_dir
15
+ cfg.verbose = true
16
+ end
17
+ end
18
+
19
+ def teardown
20
+ RDig.configuration do |cfg|
21
+ cfg.crawler = @old_crawler_cfg
22
+ end
23
+ end
24
+
25
+ def test_crawl
26
+ crawler = Crawler.new
27
+ crawler.run
28
+ end
29
+
30
+ end
31
+
32
+
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+ class FileDocumentTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
7
+ end
8
+
9
+ def test_find_files
10
+ links = FileDocument.find_files(@fixture_path)
11
+ assert_equal 3, links.size
12
+ links = FileDocument.find_files("#{@fixture_path}/html")
13
+ assert_equal 3, links.size
14
+ end
15
+
16
+ def test_fetch_directory
17
+ dir = Document.create("file://#{@fixture_path}")
18
+ dir.fetch
19
+ assert_equal 3, dir.links.size
20
+ dir = Document.create("file://#{@fixture_path}/pdf")
21
+ dir.fetch
22
+ assert_equal 1, dir.links.size
23
+ end
24
+
25
+ def test_fetch_content
26
+ file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
27
+ file.fetch
28
+ assert file.needs_indexing?
29
+ assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
30
+ end
31
+
32
+ end
33
+
34
+
@@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @extractor = ContentExtractors::HtmlContentExtractor.new
6
+ @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
7
+ @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
7
8
  @nbsp = [160].pack('U') # non breaking space
8
- @config_backup = RDig.config.content_extraction.html.clone
9
- end
10
-
11
- def teardown
12
- RDig.config.content_extraction.html = @config_backup
13
9
  end
14
10
 
15
11
  def test_can_do
@@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
41
37
  end
42
38
 
43
39
  def test_custom_content_element
44
- RDig.configuration do |config|
45
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
46
- tagsoup.find('h1', :attrs => { 'class', 'title' })
47
- end
48
- config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
49
- tagsoup.find('div', :attrs => { 'id', 'content' })
50
- end
40
+ @config.html.title_tag_selector = lambda do |tagsoup|
41
+ tagsoup.find('h1', :attrs => { 'class', 'title' })
42
+ end
43
+ @config.html.content_tag_selector = lambda do |tagsoup|
44
+ tagsoup.find('div', :attrs => { 'id', 'content' })
51
45
  end
52
46
  result = @extractor.process(html_doc('custom_tag_selectors'))
53
47
  assert_equal 'Sample Title in h1', result[:title]
@@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
61
55
 
62
56
 
63
57
  def test_title_from_dcmeta
64
- RDig.configuration do |config|
65
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
66
- tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
67
- end
58
+ @config.html.title_tag_selector = lambda do |tagsoup|
59
+ tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
68
60
  end
69
61
  result = @extractor.process(html_doc('custom_tag_selectors'))
70
62
  assert_equal 'Title from DC meta data', result[:title]
71
63
  end
72
64
 
73
65
  def test_preprocessed_title
74
- RDig.configuration do |config|
75
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
76
- title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
77
- # use only a portion of the title tag's contents if it matches our
78
- # regexp:
79
- title =~ /^(.*)meta data$/ ? $1.strip : title.strip
80
- end
66
+ @config.html.title_tag_selector = lambda do |tagsoup|
67
+ title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
68
+ # use only a portion of the title tag's contents if it matches our
69
+ # regexp:
70
+ title =~ /^(.*)meta data$/ ? $1.strip : title.strip
81
71
  end
82
72
  result = @extractor.process(html_doc('custom_tag_selectors'))
83
73
  assert_equal 'Title from DC', result[:title]
@@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @ce = ContentExtractors::PdfContentExtractor.new
6
+ @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
7
7
  end
8
8
 
9
9
  def test_can_do
@@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase
23
23
  private
24
24
  def check_content(result)
25
25
  assert_not_nil result
26
- assert_nil result[:title]
26
+ assert_equal 'PDF Test', result[:title]
27
27
  assert_nil result[:links]
28
28
  assert_not_nil result[:content]
29
- assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
29
+ assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
30
30
  end
31
31
 
32
32
  end
@@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase
13
13
  ]
14
14
  chain = UrlFilters::FilterChain.new(cfg)
15
15
 
16
- assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
17
- assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
18
- assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
16
+ assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
17
+ assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
18
+ assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
19
19
  end
20
20
 
21
21
  # test default chain config
22
22
  def test_default_filterchain
23
- chain = UrlFilters::FilterChain.new(RDig.filter_chain)
24
- assert_nil chain.apply(Document.new("http://www.example.com/affe.htm"))
25
- assert_not_nil chain.apply(Document.new("http://localhost:3000/affe.html"))
26
- assert_nil chain.apply(Document.new("http://localhost.com/affe.html"))
23
+ chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
24
+ assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
25
+ assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
26
+ assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
27
27
  end
28
28
 
29
29
  # check lookup of chain parameters from config
@@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase
38
38
  ]
39
39
  chain = UrlFilters::FilterChain.new(cfg)
40
40
 
41
- assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
42
- assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
43
- assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
41
+ assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
42
+ assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
43
+ assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
44
44
  end
45
45
 
46
46
  def test_urlpattern_filter
47
47
  f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
48
- assert_nil f.apply(Document.new("http://test.host/affe.htm"))
49
- assert_not_nil f.apply(Document.new("http://test.host/affe.html"))
48
+ assert_nil f.apply(Document.create("http://test.host/affe.htm"))
49
+ assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
50
50
  f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
51
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
52
- assert_nil f.apply(Document.new("http://test.host/affe.html"))
53
- assert_nil f.apply(Document.new("http://test.host/affe.aspx"))
51
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
52
+ assert_nil f.apply(Document.create("http://test.host/affe.html"))
53
+ assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
54
54
  f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
55
- assert_nil f.apply(Document.new("http://test.host/dir1/affe.aspx"))
56
- assert_not_nil f.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
57
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
58
- assert_not_nil f.apply(Document.new("http://test.host/dir2/affe.htm"))
55
+ assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
56
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
57
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
58
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
59
59
  f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
60
- assert_nil f.apply(Document.new("http://test.host/dir1/affe.aspx"))
61
- assert_nil f.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
62
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
63
- assert_not_nil f.apply(Document.new("http://test.host/dir2/affe.htm"))
60
+ assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
61
+ assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
62
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
63
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
64
64
  end
65
65
 
66
66
  def test_hostname_filter
67
67
  include_hosts = [ 'test.host', 'localhost' ]
68
- assert_nil UrlFilters.hostname_filter(Document.new('http://google.com/'), include_hosts)
69
- assert_not_nil UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), include_hosts)
70
- assert_not_nil UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), include_hosts)
68
+ assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
69
+ assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
70
+ assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
71
71
  end
72
72
 
73
73
  def test_fix_relative_uri
74
- doc = Document.new('http://test.host/dir/file.html')
74
+ doc = Document.create('http://test.host/dir/file.html')
75
75
  assert_equal('http://test.host/dir/another.html',
76
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
76
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
77
77
  assert_equal('http://test.host/dir/../another.html',
78
- UrlFilters.fix_relative_uri(Document.new('../another.html', doc.uri)).uri.to_s)
78
+ UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
79
79
  assert_equal('http://test.host/dir/another.html',
80
- UrlFilters.fix_relative_uri(Document.new('/dir/another.html', doc.uri)).uri.to_s)
80
+ UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
81
81
  assert_equal('http://test.host/dir/another.html',
82
- UrlFilters.fix_relative_uri(Document.new('http://test.host/dir/another.html', doc.uri)).uri.to_s)
82
+ UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
83
83
  assert_equal('HTTP://test.host/dir/another.html',
84
- UrlFilters.fix_relative_uri(Document.new('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
85
- doc = Document.new('https://test.host/dir/')
84
+ UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
85
+ doc = Document.create('https://test.host/dir/')
86
86
  assert_equal('https://test.host/dir/another.html',
87
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
88
- doc = Document.new('https://test.host/')
87
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
88
+ doc = Document.create('https://test.host/')
89
89
  assert_equal('https://test.host/another.html',
90
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
91
- doc = Document.new('https://test.host')
90
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
91
+ doc = Document.create('https://test.host')
92
92
  assert_equal('https://test.host/another.html',
93
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
93
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
94
94
  end
95
95
  end
96
96
 
@@ -3,7 +3,7 @@ class WordContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @ce = ContentExtractors::WordContentExtractor.new
6
+ @ce = ContentExtractors::WordContentExtractor.new(RDig.configuration.content_extraction)
7
7
  end
8
8
 
9
9
  def test_can_do
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.11
2
+ rubygems_version: 0.8.11.15
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.1
7
- date: 2006-04-20 00:00:00 +02:00
6
+ version: 0.3.0
7
+ date: 2006-04-26 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
+ post_install_message:
28
29
  authors:
29
30
  - Jens Kraemer
30
31
  files:
@@ -32,13 +33,14 @@ files:
32
33
  - lib/rdig
33
34
  - lib/htmlentities
34
35
  - lib/rdig.rb
35
- - lib/rdig/http_client.rb
36
36
  - lib/rdig/crawler.rb
37
37
  - lib/rdig/search.rb
38
38
  - lib/rdig/highlight.rb
39
39
  - lib/rdig/index.rb
40
40
  - lib/rdig/url_filters.rb
41
41
  - lib/rdig/content_extractors.rb
42
+ - lib/rdig/documents.rb
43
+ - lib/rdig/file.rb
42
44
  - lib/htmlentities/CHANGES
43
45
  - lib/htmlentities/COPYING
44
46
  - lib/htmlentities/README
@@ -51,6 +53,8 @@ files:
51
53
  - test/unit/html_content_extractor_test.rb
52
54
  - test/unit/pdf_content_extractor_test.rb
53
55
  - test/unit/word_content_extractor_test.rb
56
+ - test/unit/file_document_test.rb
57
+ - test/unit/crawler_fs_test.rb
54
58
  - test/fixtures/html
55
59
  - test/fixtures/pdf
56
60
  - test/fixtures/word
@@ -1,22 +0,0 @@
1
- module RDig
2
-
3
- module HttpClient
4
- def do_get(uri, user_agent='RDig crawler')
5
- # Set up the appropriate http headers
6
- headers = { "User-Agent" => user_agent }
7
- result = {}
8
-
9
- begin
10
- Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
11
- final_uri = uri.path
12
- final_uri += ('?' + uri.query) if uri.query
13
- return http.get(final_uri, headers)
14
- }
15
- rescue => error
16
- puts error
17
- end
18
- end
19
- end
20
-
21
- end
22
-