rdig 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,3 +1,17 @@
1
+ 0.3.0
2
+ - file system crawling
3
+ - optional url rewriting before indexing, e.g. for building the index directly
4
+ from the file system while linking to results via http
5
+ - PDF title extraction with pdfinfo
6
+ - removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
7
+ - made content extractors more flexible - instances now use a given
8
+ configuration instead of the global one. This allows the
9
+ WordContentExtractor to use an HtmlContentExtractor with its own
10
+ configuration that is independent of the global config.
11
+
12
+ 0.2.1
13
+ - Bugfix release
14
+
1
15
  0.2.0
2
16
  - add pdf and Word content extraction capabilities using the tools
3
17
  from the xpdf-utils and wv packages
@@ -1,25 +1,36 @@
1
1
  RDig.configuration do |cfg|
2
2
 
3
3
  ##################################################################
4
- # options you should really set
4
+ # options you really should set
5
5
 
6
6
  # provide one or more URLs for the crawler to start from
7
7
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
8
8
 
9
+ # use something like this for crawling a file system:
10
+ # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
11
+ # beware, mixing file and http crawling is not supported and may produce
12
+ # unpredictable results.
13
+
9
14
  # limit the crawl to these hosts. The crawler will never
10
15
  # follow any links pointing to hosts other than those given here.
16
+ # ignored for file system crawling
11
17
  cfg.crawler.include_hosts = [ 'www.example.com' ]
12
18
 
13
19
  # this is the path where the index will be stored
14
20
  # caution, existing contents of this directory will be deleted!
15
- cfg.ferret.path = '/path/to/index'
21
+ cfg.index.path = '/path/to/index'
16
22
 
17
23
  ##################################################################
18
24
  # options you might want to set, the given values are the defaults
25
+
26
+ # set to true to get stack traces on errors
27
+ # cfg.verbose = false
19
28
 
20
29
  # content extraction options
21
30
 
22
- # provide a method that selects the tag containing the title of a document
31
+ # provide a method that returns the title of an html document
32
+ # this method may either return a tag to extract the title from,
33
+ # or a ready-to-index string.
23
34
  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
24
35
 
25
36
  # provide a method that selects the tag containing the page content you
@@ -29,8 +40,12 @@ RDig.configuration do |cfg|
29
40
 
30
41
  # crawler options
31
42
 
32
- # nil (index all documents) or an array of Regexps
33
- # matching URLs you want to index.
43
+ # Note: for file system crawling the include_documents/exclude_documents patterns are
44
+ # applied to the full path of _files_ only (like /home/bob/test.pdf),
45
+ # for http to full URIs (like http://example.com/index.html).
46
+
47
+ # nil (include all documents) or an array of Regexps
48
+ # matching the URLs you want to index.
34
49
  # cfg.crawler.include_documents = nil
35
50
 
36
51
  # nil (no documents excluded) or an array of Regexps
@@ -40,14 +55,35 @@ RDig.configuration do |cfg|
40
55
  # included by the inclusion patterns.
41
56
  # cfg.crawler.exclude_documents = nil
42
57
 
43
- # number of http fetching threads to use
58
+ # number of document fetching threads to use. Should be raised only if
59
+ # your CPU has idle time when indexing.
44
60
  # cfg.crawler.num_threads = 2
61
+ # suggested setting for file system crawling:
62
+ # cfg.crawler.num_threads = 1
45
63
 
46
64
  # maximum number of http redirections to follow
47
65
  # cfg.crawler.max_redirects = 5
48
66
 
49
67
  # number of seconds to wait with an empty url queue before
50
- # finishing the crawl. Set to a higher number for slow sites
68
+ # finishing the crawl. Set to a higher number when experiencing incomplete
69
+ # crawls on slow sites. Don't set to 0, even when crawling a local fs.
51
70
  # cfg.crawler.wait_before_leave = 10
71
+
72
+ # indexer options
73
+
74
+ # create a new index on each run (the default). Set to false to append to the
75
+ # existing index, e.g. when building a single index from multiple runs - one
76
+ # across a website and another across a local file system tree
77
+ # cfg.index.create = true
78
+
79
+ # rewrite document uris before indexing them. This is useful if you're
80
+ # indexing on disk, but the documents should be accessible via http, e.g. from
81
+ # a web based search application. By default, no rewriting takes place.
82
+ # example:
83
+ # cfg.index.rewrite_uri = lambda { |uri|
84
+ # uri.path.gsub!(/^\/base\//, '/virtual_dir/')
85
+ # uri.scheme = 'http'
86
+ # uri.host = 'www.mydomain.com'
87
+ # }
52
88
 
53
89
  end
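Taken together, the new options make an on-disk crawl with web-friendly result links quite compact. A minimal sketch, assuming a made-up document root and hostname:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'file:///var/www/docs/' ]   # hypothetical document root
      cfg.crawler.num_threads = 1
      cfg.index.path = '/tmp/docs-index'
      cfg.index.rewrite_uri = lambda { |uri|
        uri.path.gsub!(/^\/var\/www\/docs\//, '/docs/')
        uri.scheme = 'http'
        uri.host   = 'www.example.com'
      }
    end

With the rewrite in place the index stores http://www.example.com/docs/... URLs instead of local file:// paths, so a web frontend can link straight to the results.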
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.2.1'
27
+ RDIGVERSION = '0.3.0'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -38,28 +38,28 @@ require 'set'
38
38
  require 'net/http'
39
39
  require 'getoptlong'
40
40
  require 'tempfile'
41
- # mkmf gives us the handy find_executable method used to check for helper
42
- # programs:
43
- require 'mkmf'
41
+ require 'open-uri'
44
42
 
45
43
  begin
46
- require 'rubyful_soup'
47
44
  require 'ferret'
45
+ require 'rubyful_soup'
48
46
  rescue LoadError
49
47
  require 'rubygems'
50
- require 'rubyful_soup'
51
48
  require 'ferret'
49
+ require 'rubyful_soup'
52
50
  end
53
51
 
54
52
  require 'htmlentities/htmlentities'
55
-
56
- require 'rdig/http_client'
53
+
57
54
  require 'rdig/content_extractors'
58
55
  require 'rdig/url_filters'
59
56
  require 'rdig/search'
60
57
  require 'rdig/index'
58
+ require 'rdig/file'
59
+ require 'rdig/documents'
61
60
  require 'rdig/crawler'
62
61
 
62
+
63
63
  $KCODE = 'u'
64
64
  require 'jcode'
65
65
 
@@ -68,17 +68,30 @@ module RDig
68
68
 
69
69
  class << self
70
70
 
71
- # the filter chain each URL has to run through before being crawled.
71
+ # the filter chains are for limiting the set of indexed documents.
72
+ # there are two chain types - one for http, and one for file system
73
+ # crawling.
74
+ # a document has to survive all filters in the chain to get indexed.
72
75
  def filter_chain
73
- @filter_chain ||= [
74
- { :maximum_redirect_filter => :max_redirects },
75
- :fix_relative_uri,
76
- :normalize_uri,
77
- { :hostname_filter => :include_hosts },
78
- { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
79
- { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
80
- RDig::UrlFilters::VisitedUrlFilter
81
- ]
76
+ @filter_chain ||= {
77
+ # filter chain for http crawling
78
+ :http => [
79
+ :scheme_filter_http,
80
+ :fix_relative_uri,
81
+ :normalize_uri,
82
+ { :hostname_filter => :include_hosts },
83
+ { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
84
+ { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
85
+ RDig::UrlFilters::VisitedUrlFilter
86
+ ],
87
+ # filter chain for file system crawling
88
+ :file => [
89
+ :scheme_filter_file,
90
+ { RDig::UrlFilters::PathInclusionFilter => :include_documents },
91
+ { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
92
+ ]
93
+ }
94
+
82
95
  end
83
96
 
84
97
  def application
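The per-scheme chains are consumed by UrlFilters::FilterChain; the crawler (and the url_filters tests further down) build a chain from one of them and push every candidate document through it, roughly like this:

    chain = RDig::UrlFilters::FilterChain.new(RDig.filter_chain[:http])
    doc   = RDig::Document.create('http://www.example.com/index.html')
    chain.apply(doc)   # the document if it survives every filter, nil otherwise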
@@ -86,7 +99,7 @@ module RDig
86
99
  end
87
100
 
88
101
  def searcher
89
- @searcher ||= Search::Searcher.new(config.ferret)
102
+ @searcher ||= Search::Searcher.new(config.index)
90
103
  end
91
104
 
92
105
  # RDig configuration
@@ -124,7 +137,7 @@ module RDig
124
137
  }
125
138
  )
126
139
  ),
127
- :ferret => OpenStruct.new(
140
+ :index => OpenStruct.new(
128
141
  :path => "index/",
129
142
  :create => true,
130
143
  :handle_parse_errors => true,
@@ -224,6 +237,8 @@ module RDig
224
237
 
225
238
  end
226
239
 
240
+ puts "using Ferret #{Ferret::VERSION}"
241
+
227
242
  if options.query
228
243
  # query the index
229
244
  puts "executing query >#{options.query}<"
@@ -54,7 +54,9 @@ module RDig
54
54
 
55
55
  def self.extractors; @@extractors ||= [] end
56
56
  def self.extractor_instances
57
- @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
57
+ @@extractor_instances ||= extractors.map { |ex_class|
58
+ ex_class.new(RDig.configuration.content_extraction)
59
+ }
58
60
  end
59
61
 
60
62
  def self.process(content, content_type)
@@ -65,6 +67,10 @@ module RDig
65
67
  nil
66
68
  end
67
69
 
70
+ def initialize(config)
71
+ @config = config
72
+ end
73
+
68
74
  def can_do(content_type)
69
75
  content_type =~ @pattern
70
76
  end
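The module-level process method remains the crawler's entry point; the only difference is that each registered extractor class now receives the content_extraction configuration when it is instantiated. A sketch (the file name is made up):

    # picks the first registered extractor that can_do the given content type
    RDig::ContentExtractors.process(File.read('some_page.html'), 'text/html')
    # => { :title => '...', :content => '...', :links => [...] }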
@@ -91,60 +97,88 @@ module RDig
91
97
  file.delete
92
98
  end
93
99
 
94
- def available
95
- if @available.nil?
96
- @available = !find_executable(@executable).nil?
97
- end
98
- @available
99
- end
100
-
100
+ # for this to work, the concrete extractor's initializer has to set @available
101
+ # according to the presence of the external executables it needs
101
102
  def can_do(content_type)
102
- available and super(content_type)
103
+ @available and super(content_type)
103
104
  end
104
105
  end
105
106
 
106
107
  # Extract text from pdf content.
107
108
  #
108
- # Requires the pdftotext utility from the xpdf-utils package
109
+ # Requires the pdftotext and pdfinfo utilities from the
110
+ # xpdf-utils package
109
111
  # (on debian and friends do 'apt-get install xpdf-utils')
110
112
  #
111
- # TODO: use pdfinfo to get title from document
112
113
  class PdfContentExtractor < ContentExtractor
113
114
  include ExternalAppHelper
114
115
 
115
- def initialize
116
- @executable = 'pdftotext'
116
+ def initialize(config)
117
+ super(config)
117
118
  @pattern = /^application\/pdf/
119
+ @pdftotext = 'pdftotext'
120
+ @pdfinfo = 'pdfinfo'
121
+ @available = true
122
+ [ @pdftotext, @pdfinfo].each { |program|
123
+ unless %x{#{program} -h 2>&1} =~ /Copyright 1996/
124
+ @available = false
125
+ break
126
+ end
127
+ }
118
128
  end
119
-
129
+
130
+ def process(content)
131
+ result = {}
132
+ as_file(content) do |file|
133
+ result[:content] = get_content(file.path).strip
134
+ result[:title] = get_title(file.path)
135
+ end
136
+ result
137
+ end
138
+
120
139
  def get_content(path_to_tempfile)
121
- %x{#{@executable} '#{path_to_tempfile}' -}
140
+ %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
141
+ end
142
+
143
+ # extracts the title from pdf meta data
144
+ # needs pdfinfo
145
+ # returns the title or nil if no title was found
146
+ def get_title(path_to_tempfile)
147
+ %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
148
+ rescue
122
149
  end
123
150
  end
124
151
 
125
152
  # Extract text from word documents
126
153
  #
127
- # Requires the antiword utility
128
- # (on debian and friends do 'apt-get install antiword')
154
+ # Requires the wvHtml utility
155
+ # (on debian and friends do 'apt-get install wv')
129
156
  class WordContentExtractor < ContentExtractor
130
157
  include ExternalAppHelper
131
158
 
132
- def initialize
133
- @executable = 'wvHtml'
159
+ def initialize(config)
160
+ super(config)
161
+ @wvhtml = 'wvHtml'
134
162
  @pattern = /^application\/msword/
135
- @html_extractor = HtmlContentExtractor.new
163
+ # html extractor for parsing wvHtml output
164
+ @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
165
+ :html => OpenStruct.new(
166
+ :content_tag_selector => lambda { |tagsoup|
167
+ tagsoup.html.body
168
+ },
169
+ :title_tag_selector => lambda { |tagsoup|
170
+ tagsoup.html.head.title
171
+ }
172
+ )))
173
+
174
+ # TODO: besser: if $?.exitstatus == 127 (not found)
175
+ @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
136
176
  end
137
177
 
138
178
  def process(content)
139
179
  result = {}
140
- as_file(content) do |infile|
141
- outfile = Tempfile.new('rdig')
142
- outfile.close
143
- %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
144
- File.open(outfile.path) do |html|
145
- result = @html_extractor.process(html.read)
146
- end
147
- outfile.delete
180
+ as_file(content) do |file|
181
+ result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
148
182
  end
149
183
  return result || {}
150
184
  end
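With pdfinfo wired in, processing a PDF now yields a title along with the body text, matching the updated fixture expectations further down. A sketch, assuming pdftotext and pdfinfo are installed:

    ce = RDig::ContentExtractors::PdfContentExtractor.new(
           RDig.configuration.content_extraction)
    result = ce.process(File.read('simple.pdf'))   # any local PDF
    result[:content]   # plain text produced by pdftotext -enc UTF-8
    result[:title]     # the Title field reported by pdfinfo, or nil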
@@ -154,7 +188,8 @@ module RDig
154
188
  # extracts title, content and links from html documents
155
189
  class HtmlContentExtractor < ContentExtractor
156
190
 
157
- def initialize
191
+ def initialize(config)
192
+ super(config)
158
193
  @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
159
194
  end
160
195
 
@@ -181,9 +216,10 @@ module RDig
181
216
  # children.
182
217
  def extract_content(tag_soup)
183
218
  content = ''
184
- content_element(tag_soup).children { |child|
219
+ ce = content_element(tag_soup)
220
+ ce.children { |child|
185
221
  extract_text(child, content)
186
- }
222
+ } unless ce.nil?
187
223
  return content.strip
188
224
  end
189
225
 
@@ -197,18 +233,20 @@ module RDig
197
233
 
198
234
  # Extracts the title from the given html tree
199
235
  def extract_title(tagsoup)
200
- title = ''
201
236
  the_title_tag = title_tag(tagsoup)
202
237
  if the_title_tag.is_a? String
203
238
  the_title_tag
204
239
  else
205
- extract_text(the_title_tag).strip if the_title_tag
240
+ title = ''
241
+ extract_text(the_title_tag, title)
242
+ title.strip
206
243
  end
207
244
  end
208
245
 
209
246
  # Recursively extracts all text contained in the given element,
210
247
  # and appends it to content.
211
248
  def extract_text(element, content='')
249
+ return nil if element.nil?
212
250
  if element.is_a? NavigableString
213
251
  value = strip_comments(element)
214
252
  value.strip!
@@ -234,8 +272,8 @@ module RDig
234
272
  # This may return a string, e.g. an attribute value selected from a meta
235
273
  # tag, too.
236
274
  def title_tag(tagsoup)
237
- if RDig.config.content_extraction.html.title_tag_selector
238
- RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
275
+ if @config.html.title_tag_selector
276
+ @config.html.title_tag_selector.call(tagsoup)
239
277
  else
240
278
  tagsoup.html.head.title
241
279
  end
@@ -243,8 +281,8 @@ module RDig
243
281
 
244
282
  # Retrieve the root element to extract document content from
245
283
  def content_element(tagsoup)
246
- if RDig.config.content_extraction.html.content_tag_selector
247
- RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
284
+ if @config.html.content_tag_selector
285
+ @config.html.content_tag_selector.call(tagsoup)
248
286
  else
249
287
  tagsoup.html.body
250
288
  end
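Because the selectors are now read from the instance's @config rather than the global RDig.config, extractors with different selectors can coexist - which is exactly how WordContentExtractor sets up its private HtmlContentExtractor above. A sketch with hypothetical selectors:

    custom = OpenStruct.new(:html => OpenStruct.new(
      :title_tag_selector   => lambda { |tagsoup| tagsoup.find('h1') },
      :content_tag_selector => lambda { |tagsoup| tagsoup.find('div', :attrs => { 'id', 'content' }) }
    ))
    extractor = RDig::ContentExtractors::HtmlContentExtractor.new(custom)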
@@ -9,30 +9,28 @@ module RDig
9
9
  end
10
10
 
11
11
  def run
12
- @indexer = Index::Indexer.new(RDig.config.ferret)
13
- filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
12
+ raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
13
+ @indexer = Index::Indexer.new(RDig.config.index)
14
+
15
+ # check whether we are indexing on-disk or via http
16
+ url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
17
+ chain_config = RDig.filter_chain[url_type]
18
+
19
+ filterchain = UrlFilters::FilterChain.new(chain_config)
14
20
  RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
15
21
 
16
22
  num_threads = RDig.config.crawler.num_threads
17
23
  group = ThreadsWait.new
18
24
  num_threads.times { |i|
19
25
  group.join_nowait Thread.new("fetcher #{i}") {
20
- filterchain = UrlFilters::FilterChain.new(RDig.filter_chain)
26
+ filterchain = UrlFilters::FilterChain.new(chain_config)
21
27
  while (doc = @documents.pop) != :exit
22
28
  process_document doc, filterchain
23
29
  end
24
30
  }
25
31
  }
26
32
 
27
- # dilemma: suppose we have 1 start url and two threads t1 and t2:
28
- # t1 pops the start url from the queue which now is empty
29
- # as the queue is empty now, t2 blocks until t1 adds the links
30
- # retrieved from his document.
31
- #
32
- # But we need the 'queue empty' condition as a sign for us to stop
33
- # waiting for new entries, too.
34
-
35
- # check every now and then for an empty queue
33
+ # check for an empty queue every now and then
36
34
  sleep_interval = RDig.config.crawler.wait_before_leave
37
35
  begin
38
36
  sleep sleep_interval
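Which of the two filter chains gets used is decided once per run, from the scheme of the first start URL; crawler_fs_test below exercises the file variant end to end. In short:

    RDig.configuration do |cfg|
      cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]   # selects the :file chain
    end
    RDig::Crawler.new.run
    # an http:// start url would make the same call use the :http chain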
@@ -54,22 +52,10 @@ module RDig
54
52
  } unless doc.content[:links].nil?
55
53
 
56
54
  return unless @etag_filter.apply(doc)
57
- case doc.status
58
- when :success
59
- if doc.content
60
- if doc.content[:links]
61
- doc.content[:links].each { |url| add_url(url, filterchain, doc) }
62
- end
63
- @indexer << doc
64
- #else
65
- #puts "success but no content: #{doc.uri.to_s}"
66
- end
67
- when :redirect
68
- # links contains the url we were redirected to
69
- doc.content[:links].each { |url| add_url(url, filterchain, doc) }
70
- end
55
+ @indexer << doc if doc.needs_indexing?
71
56
  rescue
72
57
  puts "error processing document #{doc.uri.to_s}: #{$!}"
58
+ puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
73
59
  end
74
60
 
75
61
 
@@ -78,82 +64,23 @@ module RDig
78
64
  # processing
79
65
  def add_url(url, filterchain, referring_document = nil)
80
66
  return if url.nil? || url.empty?
81
- if referring_document
82
- doc = Document.new(url, referring_document.uri)
83
- # keep redirect count
84
- if referring_document.status == :redirect
85
- doc.redirections = referring_document.redirections + 1
86
- end
67
+ if referring_document and referring_document.uri.scheme =~ /^https?/i
68
+ doc = Document.create(url, referring_document.uri)
87
69
  else
88
- doc = Document.new(url)
70
+ doc = Document.create(url)
89
71
  end
90
72
 
91
73
  doc = filterchain.apply(doc)
92
74
 
93
75
  if doc
94
- puts "added url #{url}"
95
- #else
96
- #puts "skipping url #{url}"
76
+ @documents << doc
77
+ puts "added url #{url}" if RDig::config.verbose
97
78
  end
98
- @documents << doc if doc
99
79
  end
100
80
 
101
81
  end
102
82
 
103
83
 
104
- class Document
105
- include HttpClient
106
-
107
- attr_reader :content
108
- attr_reader :content_type
109
- attr_reader :uri
110
- attr_reader :referring_uri
111
- attr_reader :status
112
- attr_reader :etag
113
- attr_accessor :redirections
114
-
115
- # url: url of this document, may be relative to the referring doc or host.
116
- # referrer: uri of the document we retrieved this link from
117
- def initialize(url, referrer = nil)
118
- @redirections = 0
119
- begin
120
- @uri = URI.parse(url)
121
- rescue URI::InvalidURIError
122
- raise "Cannot create document using invalid URL: #{url}"
123
- end
124
- @referring_uri = referrer
125
- end
126
-
127
- def has_content?
128
- !self.content.nil?
129
- end
130
-
131
- def title; @content[:title] end
132
- def body; @content[:content] end
133
- def url; @uri.to_s end
134
-
135
- def fetch
136
- puts "fetching #{@uri.to_s}"
137
- response = do_get(@uri)
138
- case response
139
- when Net::HTTPSuccess
140
- @content_type = response['content-type']
141
- @raw_body = response.body
142
- @etag = response['etag']
143
- # todo externalize this (another chain ?)
144
- @content = ContentExtractors.process(@raw_body, @content_type)
145
- @status = :success
146
- when Net::HTTPRedirection
147
- @status = :redirect
148
- @content = { :links => [ response['location'] ] }
149
- else
150
- puts "don't know what to do with response: #{response}"
151
- end
152
-
153
- end
154
-
155
- end
156
-
157
84
  # checks fetched documents' E-Tag headers against the list of E-Tags
158
85
  # of the documents already indexed.
159
86
  # This is supposed to help against double-indexing documents which can
@@ -169,7 +96,7 @@ module RDig
169
96
  end
170
97
 
171
98
  def apply(document)
172
- return document unless document.etag
99
+ return document unless (document.respond_to?(:etag) && document.etag)
173
100
  synchronize do
174
101
  @etags.add?(document.etag) ? document : nil
175
102
  end
@@ -0,0 +1,133 @@
1
+ module RDig
2
+
3
+ #
4
+ # Document base class
5
+ #
6
+ class Document
7
+
8
+ attr_reader :uri
9
+ attr_reader :content
10
+ attr_reader :content_type
11
+
12
+ def self.create(url, referrer_uri = nil)
13
+ # a referrer is a clear enough hint to create an HttpDocument
14
+ if referrer_uri && referrer_uri.scheme =~ /^https?$/i
15
+ return HttpDocument.new(:url => url, :referrer => referrer_uri)
16
+ end
17
+
18
+ case url
19
+ when /^https?:\/\//i
20
+ HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
21
+ when /^file:\/\//i
22
+ # files don't have referrers - the check for nil prevents us from being
23
+ # tricked into indexing local files by file:// links in the web site
24
+ # we index.
25
+ FileDocument.new(:url => url) if referrer_uri.nil?
26
+ end
27
+ end
28
+
29
+ # url: url of this document, may be relative to the referring doc or host.
30
+ # referrer: uri of the document we retrieved this link from
31
+ def initialize(args)
32
+ begin
33
+ @uri = URI.parse(args[:url])
34
+ rescue URI::InvalidURIError
35
+ raise "Cannot create document using invalid URL: #{url}"
36
+ end
37
+ end
38
+
39
+ def title; @content[:title] end
40
+ def body; @content[:content] end
41
+ def links; @content[:links] end
42
+
43
+ def needs_indexing?
44
+ has_content? && (title || body)
45
+ end
46
+
47
+ def has_content?
48
+ !self.content.nil?
49
+ end
50
+
51
+ end
52
+
53
+
54
+ #
55
+ # Document in a File system
56
+ #
57
+ class FileDocument < Document
58
+ def initialize(args={})
59
+ super(args)
60
+ end
61
+
62
+ def self.find_files(path)
63
+ links = []
64
+ Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
65
+ # Skip files not matching known mime types
66
+ pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
67
+ if File.directory?(filename) || filename =~ pattern
68
+ links << "file://#{filename}"
69
+ end
70
+ end
71
+ links
72
+ end
73
+
74
+ def file?
75
+ File.file? @uri.path
76
+ end
77
+
78
+ def fetch
79
+ if File.directory? @uri.path
80
+ # directories are treated like a link collection
81
+ @content = { :links => self.class.find_files(@uri.path) }
82
+ else
83
+ # process this file's contents
84
+ open(@uri.path) do |file|
85
+ @content = ContentExtractors.process(file.read, file.content_type)
86
+ @content[:links] = nil if @content # don't follow links inside files
87
+ end
88
+ end
89
+ @content ||= {}
90
+ end
91
+
92
+ end
93
+
94
+
95
+ #
96
+ # Remote Document to be retrieved by HTTP
97
+ #
98
+ class HttpDocument < Document
99
+
100
+ attr_reader :referring_uri
101
+ attr_reader :status
102
+ attr_reader :etag
103
+
104
+ # url: url of this document, may be relative to the referring doc or host.
105
+ # referrer: uri of the document we retrieved this link from
106
+ def initialize(args={})
107
+ super(args)
108
+ @referring_uri = args[:referrer]
109
+ end
110
+
111
+ def fetch
112
+ puts "fetching #{@uri.to_s}" if RDig::config.verbose
113
+ open(@uri.to_s) do |doc|
114
+ case doc.status.first.to_i
115
+ when 200
116
+ @etag = doc.meta['etag']
117
+ # puts "etag: #{@etag}"
118
+ @content = ContentExtractors.process(doc.read, doc.content_type)
119
+ @status = :success
120
+ when 404
121
+ puts "got 404 for #{url}"
122
+ else
123
+ puts "don't know what to do with response: #{doc.status.join(' : ')}"
124
+ end
125
+ end
126
+ rescue
127
+ puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
128
+ ensure
129
+ @content ||= {}
130
+ end
131
+
132
+ end
133
+ end
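The factory keeps the crawler scheme-agnostic - callers only ever use Document.create and get the matching subclass back:

    RDig::Document.create('http://www.example.com/index.html')   # => HttpDocument
    RDig::Document.create('file:///home/bob/documents/a.pdf')    # => FileDocument

A file:// link found on a crawled web page still arrives with an http referrer and therefore becomes an HttpDocument, which scheme_filter_http then rejects, so such links never reach the local disk.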
@@ -0,0 +1,18 @@
1
+ # Extend class File with a content_type method
2
+ class File
3
+
4
+ # mime types and file extensions
5
+ FILE_EXTENSION_MIME_TYPES = {
6
+ 'doc' => 'application/msword',
7
+ 'html' => 'text/html',
8
+ 'htm' => 'text/html',
9
+ # 'odt' => 'application/vnd.oasis.opendocument.text',
10
+ 'pdf' => 'application/pdf',
11
+ 'txt' => 'text/plain',
12
+ }
13
+
14
+ def content_type
15
+ FILE_EXTENSION_MIME_TYPES[File.extname(self.path).downcase.gsub(/^\./,'')] || 'application/octet-stream'
16
+ end
17
+
18
+ end
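The lookup goes by file extension only and falls back to application/octet-stream; FileDocument#fetch uses it to pick the matching content extractor. For example (paths made up):

    File.open('/home/bob/documents/report.pdf') { |f| f.content_type }
    # => 'application/pdf'
    File.open('/home/bob/documents/notes.dat') { |f| f.content_type }
    # => 'application/octet-stream'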
@@ -6,7 +6,7 @@ module RDig
6
6
  include MonitorMixin, Ferret::Index, Ferret::Document
7
7
 
8
8
  def initialize(settings)
9
- #@ferret_config = settings
9
+ @config = settings
10
10
  @index_writer = IndexWriter.new(settings.path,
11
11
  :create => settings.create,
12
12
  :analyzer => settings.analyzer)
@@ -14,10 +14,12 @@ module RDig
14
14
  end
15
15
 
16
16
  def add_to_index(document)
17
- puts "add to index: #{document.uri.to_s}"
17
+ puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
18
18
  doc = Ferret::Document::Document.new
19
- doc << Field.new("url", document.url,
20
- Field::Store::YES, Field::Index::UNTOKENIZED)
19
+ @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
20
+
21
+ doc << Field.new("url", document.uri.to_s,
22
+ Field::Store::YES, Field::Index::TOKENIZED)
21
23
  doc << Field.new("title", document.title,
22
24
  Field::Store::YES, Field::Index::TOKENIZED)
23
25
  doc << Field.new("data", document.body,
@@ -82,7 +82,7 @@ module RDig
82
82
 
83
83
 
84
84
  # base class for url inclusion / exclusion filters
85
- class UrlPatternFilter
85
+ class PatternFilter
86
86
  # takes an Array of Regexps, or nil to disable the filter
87
87
  def initialize(args=nil)
88
88
  unless args.nil?
@@ -98,8 +98,8 @@ module RDig
98
98
  end
99
99
  end
100
100
  end
101
- class UrlExclusionFilter < UrlPatternFilter
102
- # returns nil if any of the patterns matches it's URL,
101
+ class UrlExclusionFilter < PatternFilter
102
+ # returns nil if any of the patterns matches its URI,
103
103
  # the document itself otherwise
104
104
  def apply(document)
105
105
  return document unless @patterns
@@ -109,9 +109,9 @@ module RDig
109
109
  return document
110
110
  end
111
111
  end
112
- class UrlInclusionFilter < UrlPatternFilter
113
- # returns nil if any of the patterns matches it's URL,
114
- # the document itself otherwise
112
+ class UrlInclusionFilter < PatternFilter
113
+ # returns the document if any of the patterns matches its URI,
114
+ # nil otherwise
115
115
  def apply(document)
116
116
  return document unless @patterns
117
117
  @patterns.each { |p|
@@ -121,21 +121,42 @@ module RDig
121
121
  end
122
122
  end
123
123
 
124
-
124
+ # returns nil if any of the patterns matches its path,
125
+ # the document itself otherwise. Applied to real files only.
126
+ class PathExclusionFilter < PatternFilter
127
+ def apply(document)
128
+ return document unless (@patterns && document.file?)
129
+ @patterns.each { |p|
130
+ return nil if document.uri.path =~ p
131
+ }
132
+ return document
133
+ end
134
+ end
135
+ # returns the document if any of the patterns matches its path,
136
+ # nil otherwise. Applied to real files only.
137
+ class PathInclusionFilter < PatternFilter
138
+ def apply(document)
139
+ return document unless (@patterns && document.file?)
140
+ @patterns.each { |p|
141
+ return document if document.uri.path =~ p
142
+ }
143
+ return nil
144
+ end
145
+ end
125
146
 
126
147
 
127
148
  # checks redirect count of the given document
128
149
  # takes it out of the chain if number of redirections exceeds the
129
150
  # max_redirects setting
130
151
  def UrlFilters.maximum_redirect_filter(document, max_redirects)
131
- return nil if document.redirections > max_redirects
152
+ return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
132
153
  return document
133
154
  end
134
155
 
135
156
  # expands both href="/path/xyz.html" and href="affe.html"
136
157
  # to full urls
137
158
  def UrlFilters.fix_relative_uri(document)
138
- return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^http/i
159
+ #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
139
160
  ref = document.referring_uri
140
161
  return document unless ref
141
162
  uri = document.uri
@@ -150,6 +171,9 @@ module RDig
150
171
  uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
151
172
  end
152
173
  return document
174
+ rescue
175
+ p document
176
+ p document.uri
153
177
  end
154
178
 
155
179
  def UrlFilters.hostname_filter(document, include_hosts)
@@ -167,5 +191,14 @@ module RDig
167
191
  return document
168
192
  end
169
193
 
194
+ def UrlFilters.scheme_filter_file(document)
195
+ return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^file$/i)
196
+ nil
197
+ end
198
+ def UrlFilters.scheme_filter_http(document)
199
+ return document if (document.uri.scheme.nil? || document.uri.scheme =~ /^https?$/i)
200
+ nil
201
+ end
202
+
170
203
  end
171
204
  end
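The two scheme filters sit at the head of their respective chains and simply reject documents of the wrong kind. Roughly:

    http_doc = RDig::Document.create('http://www.example.com/')
    file_doc = RDig::Document.create('file:///home/bob/documents/a.pdf')
    RDig::UrlFilters.scheme_filter_http(http_doc)   # => the document
    RDig::UrlFilters.scheme_filter_http(file_doc)   # => nil
    RDig::UrlFilters.scheme_filter_file(file_doc)   # => the document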
Binary file
@@ -0,0 +1,32 @@
1
+ require 'test_helper'
2
+ class CrawlerFsTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
7
+ index_dir = 'tmp/test-index'
8
+ Dir.mkdir index_dir unless File.directory? index_dir
9
+ RDig.configuration do |cfg|
10
+ @old_crawler_cfg = cfg.crawler.clone
11
+ cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
12
+ cfg.crawler.num_threads = 1
13
+ cfg.crawler.wait_before_leave = 1
14
+ cfg.index.path = index_dir
15
+ cfg.verbose = true
16
+ end
17
+ end
18
+
19
+ def teardown
20
+ RDig.configuration do |cfg|
21
+ cfg.crawler = @old_crawler_cfg
22
+ end
23
+ end
24
+
25
+ def test_crawl
26
+ crawler = Crawler.new
27
+ crawler.run
28
+ end
29
+
30
+ end
31
+
32
+
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+ class FileDocumentTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
7
+ end
8
+
9
+ def test_find_files
10
+ links = FileDocument.find_files(@fixture_path)
11
+ assert_equal 3, links.size
12
+ links = FileDocument.find_files("#{@fixture_path}/html")
13
+ assert_equal 3, links.size
14
+ end
15
+
16
+ def test_fetch_directory
17
+ dir = Document.create("file://#{@fixture_path}")
18
+ dir.fetch
19
+ assert_equal 3, dir.links.size
20
+ dir = Document.create("file://#{@fixture_path}/pdf")
21
+ dir.fetch
22
+ assert_equal 1, dir.links.size
23
+ end
24
+
25
+ def test_fetch_content
26
+ file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
27
+ file.fetch
28
+ assert file.needs_indexing?
29
+ assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
30
+ end
31
+
32
+ end
33
+
34
+
@@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @extractor = ContentExtractors::HtmlContentExtractor.new
6
+ @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
7
+ @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
7
8
  @nbsp = [160].pack('U') # non breaking space
8
- @config_backup = RDig.config.content_extraction.html.clone
9
- end
10
-
11
- def teardown
12
- RDig.config.content_extraction.html = @config_backup
13
9
  end
14
10
 
15
11
  def test_can_do
@@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
41
37
  end
42
38
 
43
39
  def test_custom_content_element
44
- RDig.configuration do |config|
45
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
46
- tagsoup.find('h1', :attrs => { 'class', 'title' })
47
- end
48
- config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
49
- tagsoup.find('div', :attrs => { 'id', 'content' })
50
- end
40
+ @config.html.title_tag_selector = lambda do |tagsoup|
41
+ tagsoup.find('h1', :attrs => { 'class', 'title' })
42
+ end
43
+ @config.html.content_tag_selector = lambda do |tagsoup|
44
+ tagsoup.find('div', :attrs => { 'id', 'content' })
51
45
  end
52
46
  result = @extractor.process(html_doc('custom_tag_selectors'))
53
47
  assert_equal 'Sample Title in h1', result[:title]
@@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
61
55
 
62
56
 
63
57
  def test_title_from_dcmeta
64
- RDig.configuration do |config|
65
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
66
- tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
67
- end
58
+ @config.html.title_tag_selector = lambda do |tagsoup|
59
+ tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
68
60
  end
69
61
  result = @extractor.process(html_doc('custom_tag_selectors'))
70
62
  assert_equal 'Title from DC meta data', result[:title]
71
63
  end
72
64
 
73
65
  def test_preprocessed_title
74
- RDig.configuration do |config|
75
- config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
76
- title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
77
- # use only a portion of the title tag's contents if it matches our
78
- # regexp:
79
- title =~ /^(.*)meta data$/ ? $1.strip : title.strip
80
- end
66
+ @config.html.title_tag_selector = lambda do |tagsoup|
67
+ title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
68
+ # use only a portion of the title tag's contents if it matches our
69
+ # regexp:
70
+ title =~ /^(.*)meta data$/ ? $1.strip : title.strip
81
71
  end
82
72
  result = @extractor.process(html_doc('custom_tag_selectors'))
83
73
  assert_equal 'Title from DC', result[:title]
@@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @ce = ContentExtractors::PdfContentExtractor.new
6
+ @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
7
7
  end
8
8
 
9
9
  def test_can_do
@@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase
23
23
  private
24
24
  def check_content(result)
25
25
  assert_not_nil result
26
- assert_nil result[:title]
26
+ assert_equal 'PDF Test', result[:title]
27
27
  assert_nil result[:links]
28
28
  assert_not_nil result[:content]
29
- assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
29
+ assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
30
30
  end
31
31
 
32
32
  end
@@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase
13
13
  ]
14
14
  chain = UrlFilters::FilterChain.new(cfg)
15
15
 
16
- assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
17
- assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
18
- assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
16
+ assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
17
+ assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
18
+ assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
19
19
  end
20
20
 
21
21
  # test default chain config
22
22
  def test_default_filterchain
23
- chain = UrlFilters::FilterChain.new(RDig.filter_chain)
24
- assert_nil chain.apply(Document.new("http://www.example.com/affe.htm"))
25
- assert_not_nil chain.apply(Document.new("http://localhost:3000/affe.html"))
26
- assert_nil chain.apply(Document.new("http://localhost.com/affe.html"))
23
+ chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
24
+ assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
25
+ assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
26
+ assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
27
27
  end
28
28
 
29
29
  # check lookup of chain parameters from config
@@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase
38
38
  ]
39
39
  chain = UrlFilters::FilterChain.new(cfg)
40
40
 
41
- assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
42
- assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
43
- assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
41
+ assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
42
+ assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
43
+ assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
44
44
  end
45
45
 
46
46
  def test_urlpattern_filter
47
47
  f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
48
- assert_nil f.apply(Document.new("http://test.host/affe.htm"))
49
- assert_not_nil f.apply(Document.new("http://test.host/affe.html"))
48
+ assert_nil f.apply(Document.create("http://test.host/affe.htm"))
49
+ assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
50
50
  f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
51
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
52
- assert_nil f.apply(Document.new("http://test.host/affe.html"))
53
- assert_nil f.apply(Document.new("http://test.host/affe.aspx"))
51
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
52
+ assert_nil f.apply(Document.create("http://test.host/affe.html"))
53
+ assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
54
54
  f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
55
- assert_nil f.apply(Document.new("http://test.host/dir1/affe.aspx"))
56
- assert_not_nil f.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
57
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
58
- assert_not_nil f.apply(Document.new("http://test.host/dir2/affe.htm"))
55
+ assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
56
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
57
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
58
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
59
59
  f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
60
- assert_nil f.apply(Document.new("http://test.host/dir1/affe.aspx"))
61
- assert_nil f.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
62
- assert_not_nil f.apply(Document.new("http://test.host/affe.htm"))
63
- assert_not_nil f.apply(Document.new("http://test.host/dir2/affe.htm"))
60
+ assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
61
+ assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
62
+ assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
63
+ assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
64
64
  end
65
65
 
66
66
  def test_hostname_filter
67
67
  include_hosts = [ 'test.host', 'localhost' ]
68
- assert_nil UrlFilters.hostname_filter(Document.new('http://google.com/'), include_hosts)
69
- assert_not_nil UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), include_hosts)
70
- assert_not_nil UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), include_hosts)
68
+ assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
69
+ assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
70
+ assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
71
71
  end
72
72
 
73
73
  def test_fix_relative_uri
74
- doc = Document.new('http://test.host/dir/file.html')
74
+ doc = Document.create('http://test.host/dir/file.html')
75
75
  assert_equal('http://test.host/dir/another.html',
76
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
76
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
77
77
  assert_equal('http://test.host/dir/../another.html',
78
- UrlFilters.fix_relative_uri(Document.new('../another.html', doc.uri)).uri.to_s)
78
+ UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
79
79
  assert_equal('http://test.host/dir/another.html',
80
- UrlFilters.fix_relative_uri(Document.new('/dir/another.html', doc.uri)).uri.to_s)
80
+ UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
81
81
  assert_equal('http://test.host/dir/another.html',
82
- UrlFilters.fix_relative_uri(Document.new('http://test.host/dir/another.html', doc.uri)).uri.to_s)
82
+ UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
83
83
  assert_equal('HTTP://test.host/dir/another.html',
84
- UrlFilters.fix_relative_uri(Document.new('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
85
- doc = Document.new('https://test.host/dir/')
84
+ UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
85
+ doc = Document.create('https://test.host/dir/')
86
86
  assert_equal('https://test.host/dir/another.html',
87
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
88
- doc = Document.new('https://test.host/')
87
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
88
+ doc = Document.create('https://test.host/')
89
89
  assert_equal('https://test.host/another.html',
90
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
91
- doc = Document.new('https://test.host')
90
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
91
+ doc = Document.create('https://test.host')
92
92
  assert_equal('https://test.host/another.html',
93
- UrlFilters.fix_relative_uri(Document.new('another.html', doc.uri)).uri.to_s)
93
+ UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
94
94
  end
95
95
  end
96
96
 
@@ -3,7 +3,7 @@ class WordContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @ce = ContentExtractors::WordContentExtractor.new
6
+ @ce = ContentExtractors::WordContentExtractor.new(RDig.configuration.content_extraction)
7
7
  end
8
8
 
9
9
  def test_can_do
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.11
2
+ rubygems_version: 0.8.11.15
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.1
7
- date: 2006-04-20 00:00:00 +02:00
6
+ version: 0.3.0
7
+ date: 2006-04-26 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
@@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
+ post_install_message:
28
29
  authors:
29
30
  - Jens Kraemer
30
31
  files:
@@ -32,13 +33,14 @@ files:
32
33
  - lib/rdig
33
34
  - lib/htmlentities
34
35
  - lib/rdig.rb
35
- - lib/rdig/http_client.rb
36
36
  - lib/rdig/crawler.rb
37
37
  - lib/rdig/search.rb
38
38
  - lib/rdig/highlight.rb
39
39
  - lib/rdig/index.rb
40
40
  - lib/rdig/url_filters.rb
41
41
  - lib/rdig/content_extractors.rb
42
+ - lib/rdig/documents.rb
43
+ - lib/rdig/file.rb
42
44
  - lib/htmlentities/CHANGES
43
45
  - lib/htmlentities/COPYING
44
46
  - lib/htmlentities/README
@@ -51,6 +53,8 @@ files:
51
53
  - test/unit/html_content_extractor_test.rb
52
54
  - test/unit/pdf_content_extractor_test.rb
53
55
  - test/unit/word_content_extractor_test.rb
56
+ - test/unit/file_document_test.rb
57
+ - test/unit/crawler_fs_test.rb
54
58
  - test/fixtures/html
55
59
  - test/fixtures/pdf
56
60
  - test/fixtures/word
@@ -1,22 +0,0 @@
1
- module RDig
2
-
3
- module HttpClient
4
- def do_get(uri, user_agent='RDig crawler')
5
- # Set up the appropriate http headers
6
- headers = { "User-Agent" => user_agent }
7
- result = {}
8
-
9
- begin
10
- Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
11
- final_uri = uri.path
12
- final_uri += ('?' + uri.query) if uri.query
13
- return http.get(final_uri, headers)
14
- }
15
- rescue => error
16
- puts error
17
- end
18
- end
19
- end
20
-
21
- end
22
-