rdig 0.3.4 → 0.3.5

data/CHANGES CHANGED
@@ -1,3 +1,11 @@
+ 0.3.5
+ - Add max_depth option to crawler configuration for limiting the crawl to a
+   specific depth
+ - add support for http proxies including basic authentication
+ - remove rubyful_soup support
+
+ 0.3.4
+
  0.3.2
  - make RDig compatible with Ferret 0.10.x
  - won't work any more with Ferret 0.9.x and before
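Taken together, the new 0.3.5 options could be used like this in a config file (a minimal sketch based on the option names introduced in this release and shown in the config example diff below; URLs, hosts and paths are placeholders):

    RDig.configuration do |cfg|
      cfg.log_file  = '/tmp/rdig.log'          # where the new Logger writes
      cfg.log_level = :info                    # :debug, :info, :warn or :error
      cfg.crawler.start_urls = [ 'http://www.example.com/' ]
      cfg.crawler.max_depth  = 2               # nil = unlimited, 0 = index only the start_urls
      # proxy settings are optional; user/pass only for authenticating proxies
      cfg.crawler.http_proxy      = 'http://proxy.example.com:8080'
      cfg.crawler.http_proxy_user = 'username'
      cfg.crawler.http_proxy_pass = 'password'
    end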
@@ -2,6 +2,12 @@ RDig.configuration do |cfg|

  ##################################################################
  # options you really should set
+
+ # log file location
+ cfg.log_file = '/tmp/rdig.log'
+
+ # log level, set to :debug, :info, :warn or :error
+ cfg.log_level = :info

  # provide one or more URLs for the crawler to start from
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
@@ -29,10 +35,11 @@ RDig.configuration do |cfg|
  # content extraction options
  cfg.content_extraction = OpenStruct.new(

- # HPRICOT configuration
- # this is the html parser used by default from RDig 0.3.3 upwards.
- # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
- # it comes to selection of portions of the html documents.
+ # HPRICOT configuration
+ # hpricot is the html parsing lib used by RDig. See
+ # http://code.whytheluckystiff.net/hpricot for usage information.
+ # Any code blocks given for content selection will receive an Hpricot instance
+ # containing the full page content when called.
  :hpricot => OpenStruct.new(
  # css selector for the element containing the page title
  :title_tag_selector => 'title',
@@ -42,26 +49,6 @@ RDig.configuration do |cfg|
  # might also be a proc returning either an element or a string:
  # :content_tag_selector => lambda { |hpricot_doc| ... }
  )
-
- # RUBYFUL SOUP
- # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
- # RDig's default html parser up to version 0.3.2. To use it, comment the
- # hpricot config above, and uncomment the following:
- #
- # :rubyful_soup => OpenStruct.new(
- # # provide a method that returns the title of an html document
- # # this method may either return a tag to extract the title from,
- # # or a ready-to-index string.
- # :content_tag_selector => lambda { |tagsoup|
- # tagsoup.html.body
- # },
- # # provide a method that selects the tag containing the page content you
- # # want to index. Useful to avoid indexing common elements like navigation
- # # and page footers for every page.
- # :title_tag_selector => lambda { |tagsoup|
- # tagsoup.html.head.title
- # }
- # )
  )

  # crawler options
@@ -95,12 +82,25 @@ RDig.configuration do |cfg|
  # crawls on slow sites. Don't set to 0, even when crawling a local fs.
  # cfg.crawler.wait_before_leave = 10

+ # limit the crawling depth. Default: nil (unlimited)
+ # Set to 0 to only index the start_urls.
+ # cfg.crawler.max_depth = nil
+
+ # http proxy configuration
+ # proxy url
+ # cfg.crawler.http_proxy = nil
+ #
+ # proxy username
+ # cfg.crawler.http_proxy_user = nil
+ # proxy password
+ # cfg.crawler.http_proxy_pass = nil
+
  # indexer options

  # create a new index on each run. Will append to the index if false. Use when
  # building a single index from multiple runs, e.g. one across a website and the
  # other a tree in a local file system
- # config.index.create = true
+ # cfg.index.create = true

  # rewrite document uris before indexing them. This is useful if you're
  # indexing on disk, but the documents should be accessible via http, e.g. from
@@ -24,7 +24,7 @@
  #++
  #

- RDIGVERSION = '0.3.4'
+ RDIGVERSION = '0.3.5'


  require 'thread'
@@ -39,6 +39,8 @@ require 'net/http'
  require 'getoptlong'
  require 'tempfile'
  require 'open-uri'
+ require 'logger'
+ require 'base64'

  begin
  require 'ferret'
@@ -69,10 +71,11 @@ module RDig
  :scheme_filter_http,
  :fix_relative_uri,
  :normalize_uri,
+ { RDig::UrlFilters::DepthFilter => :max_depth },
  { :hostname_filter => :include_hosts },
  { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
  { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
- RDig::UrlFilters::VisitedUrlFilter
+ RDig::UrlFilters::VisitedUrlFilter
  ],
  # filter chain for file system crawling
  :file => [
@@ -103,6 +106,8 @@ module RDig
  yield configuration
  else
  @config ||= OpenStruct.new(
+ :log_file => '/tmp/rdig.log',
+ :log_level => :warn,
  :crawler => OpenStruct.new(
  :start_urls => [ "http://localhost:3000/" ],
  :include_hosts => [ "localhost" ],
@@ -111,7 +116,11 @@ module RDig
  :index_document => nil,
  :num_threads => 2,
  :max_redirects => 5,
- :wait_before_leave => 10
+ :max_depth => nil,
+ :wait_before_leave => 10,
+ :http_proxy => nil,
+ :http_proxy_user => nil,
+ :http_proxy_pass => nil
  ),
  :content_extraction => OpenStruct.new(
  # settings for html content extraction (hpricot)
@@ -124,19 +133,6 @@ module RDig
  # might also be a proc returning either an element or a string:
  # :content_tag_selector => lambda { |hpricot_doc| ... }
  )
- #,
- # # settings for html content extraction (RubyfulSoup)
- # :rubyful_soup => OpenStruct.new(
- # # select the html element that contains the content to index
- # # by default, we index all inside the body tag:
- # :content_tag_selector => lambda { |tagsoup|
- # tagsoup.html.body
- # },
- # # select the html element containing the title
- # :title_tag_selector => lambda { |tagsoup|
- # tagsoup.html.head.title
- # }
- # )
  ),
  :index => OpenStruct.new(
  :path => "index/",
@@ -151,6 +147,36 @@ module RDig
  end
  alias config configuration

+ def logger
+ @logger ||= create_logger
+ end
+
+ def logger=(log)
+ @logger = log
+ end
+
+ def create_logger
+ l = Logger.new(RDig.config.log_file)
+ l.level = Logger.const_get RDig.config.log_level.to_s.upcase rescue Logger::WARN
+ return l
+ end
+
+ # returns http options for open_uri if configured
+ def open_uri_http_options
+ unless RDig::configuration.crawler.open_uri_http_options
+ opts = {}
+ if RDig::configuration.crawler.http_proxy
+ opts[:proxy] = RDig::configuration.crawler.http_proxy
+ if user = RDig::configuration.crawler.http_proxy_user
+ pass = RDig::configuration.crawler.http_proxy_pass
+ opts['Authorization'] = "Basic " + Base64.encode64("#{user}:#{pass}")
+ end
+ end
+ RDig::configuration.crawler.open_uri_http_options = opts
+ end
+ return RDig::configuration.crawler.open_uri_http_options
+ end
+
  end

  class Application
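The proxy support above boils down to an options hash handed to open-uri's open(): :proxy carries the proxy URL and, when a proxy user is set, an Authorization header with the Base64-encoded credentials is added. A quick illustration of the header value (the same string the new rdig_test.rb below asserts):

    require 'base64'
    # with http_proxy_user = 'username' and http_proxy_pass = 'password'
    Base64.encode64("username:password")  # => "dXNlcm5hbWU6cGFzc3dvcmQ=\n"
    # so the crawler effectively calls
    # open(url, :proxy => cfg.crawler.http_proxy, 'Authorization' => "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n")

Note that the hash is memoized in cfg.crawler.open_uri_http_options, so proxy settings need to be in place before the first fetch.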
@@ -210,7 +236,6 @@ module RDig
  when '--query'
  options.query = value
  when '--version'
- puts "rdig, version #{RDIGVERSION}"
  exit
  else
  fail "Unknown option: #{opt}"
@@ -22,7 +22,7 @@ module RDig
  def self.extractors; @@extractors ||= [] end
  def self.extractor_instances
  @@extractor_instances ||= extractors.map { |ex_class|
- puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
+ RDig.logger.info "initializing content extractor: #{ex_class}"
  ex_class.new(RDig.configuration.content_extraction) rescue nil
  }.compact
  end
@@ -77,8 +77,8 @@ end
  # load content extractors
  Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
  begin
- require f
+ require f
  rescue LoadError
- puts "could not load #{f}: #{$!}"
+ RDig::logger.error "could not load #{f}: #{$!}"
  end
  end
@@ -40,12 +40,11 @@ module RDig
  # all textual content contained in the root element and all it's
  # children.
  def extract_content(doc)
- content = ''
- ce = content_element(doc)
- content = strip_tags(strip_comments(ce.inner_html)) if ce
- # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
- # extract_text child, content
- return content.strip
+ if ce = content_element(doc)
+ return strip_tags(strip_comments(ce.inner_html))
+ end
+ # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
+ return ''
  end

  # extracts the href attributes of all a tags, except
@@ -91,7 +90,8 @@ module RDig
  Regexp::MULTILINE, 'u'), ''
  string.gsub! Regexp.new('<.+?>',
  Regexp::MULTILINE, 'u'), ''
- string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+ string.gsub! Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+ string.strip
  end

  end
@@ -3,23 +3,30 @@ module RDig

  class Crawler

- def initialize
+ def initialize(config = RDig.config, logger = RDig.logger)
  @documents = Queue.new
  @etag_filter = ETagFilter.new
+ @logger = logger
+ @config = config
  end

  def run
- raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
- @indexer = Index::Indexer.new(RDig.config.index)
-
+ @indexer = Index::Indexer.new(@config.index)
+ crawl
+ ensure
+ @indexer.close if @indexer
+ end
+
+ def crawl
+ raise 'no start urls given!' if @config.crawler.start_urls.empty?
  # check whether we are indexing on-disk or via http
- url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
+ url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
  chain_config = RDig.filter_chain[url_type]

  filterchain = UrlFilters::FilterChain.new(chain_config)
- RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
-
- num_threads = RDig.config.crawler.num_threads
+ @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
+
+ num_threads = @config.crawler.num_threads
  group = ThreadsWait.new
  num_threads.times { |i|
  group.join_nowait Thread.new("fetcher #{i}") {
@@ -31,20 +38,19 @@ module RDig
  }

  # check for an empty queue every now and then
- sleep_interval = RDig.config.crawler.wait_before_leave
+ sleep_interval = @config.crawler.wait_before_leave
  begin
  sleep sleep_interval
  end until @documents.empty?
  # nothing to do any more, tell the threads to exit
  num_threads.times { @documents << :exit }

- puts "waiting for threads to finish..."
+ @logger.info "waiting for threads to finish..."
  group.all_waits
- ensure
- @indexer.close if @indexer
  end

  def process_document(doc, filterchain)
+ @logger.debug "processing document #{doc}"
  doc.fetch
  # add links from this document to the queue
  doc.content[:links].each { |url|
@@ -52,10 +58,14 @@ module RDig
  } unless doc.content[:links].nil?

  return unless @etag_filter.apply(doc)
- @indexer << doc if doc.needs_indexing?
+ add_to_index doc
  rescue
- puts "error processing document #{doc.uri.to_s}: #{$!}"
- puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
+ @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
+ @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+ end
+
+ def add_to_index(doc)
+ @indexer << doc if doc.needs_indexing?
  end


@@ -64,17 +74,19 @@ module RDig
  # processing
  def add_url(url, filterchain, referring_document = nil)
  return if url.nil? || url.empty?
- if referring_document and referring_document.uri.scheme =~ /^https?/i
- doc = Document.create(url, referring_document.uri)
+
+ @logger.debug "add_url #{url}"
+ doc = if referring_document
+ referring_document.create_child(url)
  else
- doc = Document.create(url)
+ Document.create(url)
  end

  doc = filterchain.apply(doc)

  if doc
  @documents << doc
- puts "added url #{url}" if RDig::config.verbose
+ @logger.debug "url #{url} survived filterchain"
  end
  rescue
  nil
@@ -9,30 +9,23 @@ module RDig
  attr_reader :content
  attr_reader :content_type

- def self.create(url, referrer_uri = nil)
- # a referrer is a clear enough hint to create an HttpDocument
- if referrer_uri && referrer_uri.scheme =~ /^https?$/i
- return HttpDocument.new(:url => url, :referrer => referrer_uri)
- end
-
- case url
- when /^https?:\/\//i
- HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
- when /^file:\/\//i
- # files don't have referrers - the check for nil prevents us from being
- # tricked into indexing local files by file:// links in the web site
- # we index.
- FileDocument.new(:url => url) if referrer_uri.nil?
+ def self.create(url)
+ return case url
+ when /^https?:\/\//i
+ HttpDocument.new(:uri => url)
+ when /^file:\/\//i
+ FileDocument.new(:uri => url)
  end
  end

  # url: url of this document, may be relative to the referring doc or host.
  # referrer: uri of the document we retrieved this link from
  def initialize(args)
+ RDig.logger.debug "initialize: #{args.inspect}"
  begin
- @uri = URI.parse(args[:url])
+ @uri = URI.parse(args[:uri])
  rescue URI::InvalidURIError
- raise "Cannot create document using invalid URL: #{args[:url]}"
+ raise "Cannot create document using invalid URL: #{args[:uri]}"
  end
  end

@@ -48,6 +41,10 @@ module RDig
  !self.content.nil?
  end

+ def to_s
+ "#{self.class.name}, uri=#{uri}, title=#{has_content? ? title : 'not loaded yet'}"
+ end
+
  end


@@ -59,14 +56,17 @@ module RDig
  super(args)
  end

+ def create_child(uri)
+ FileDocument.new(:uri => uri)
+ end
+
  def self.find_files(path)
  links = []
+ pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
  Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+ RDig.logger.debug "checking file #{filename}"
  # Skip files not matching known mime types
- pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
- if File.directory?(filename) || filename =~ pattern
- links << "file://#{filename}"
- end
+ links << "file://#{filename}" if File.directory?(filename) || filename =~ pattern
  end
  links
  end
@@ -97,20 +97,27 @@ module RDig
  #
  class HttpDocument < Document

+ # counts how far this document is away from one of the start urls. Used to limit crawling by depth.
+ attr_reader :depth
  attr_reader :referring_uri
  attr_reader :status
  attr_reader :etag
+
+ def create_child(uri)
+ HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
+ end

  # url: url of this document, may be relative to the referring doc or host.
  # referrer: uri of the document we retrieved this link from
  def initialize(args={})
  super(args)
  @referring_uri = args[:referrer]
+ @depth = args[:depth] || 0
  end

  def fetch
- puts "fetching #{@uri.to_s}" if RDig::config.verbose
- open(@uri.to_s) do |doc|
+ RDig.logger.debug "fetching #{@uri.to_s}"
+ open(@uri.to_s, RDig::open_uri_http_options) do |doc|
  case doc.status.first.to_i
  when 200
  @etag = doc.meta['etag']
@@ -118,13 +125,13 @@ module RDig
  @content = ContentExtractors.process(doc.read, doc.content_type)
  @status = :success
  when 404
- puts "got 404 for #{@uri}"
+ RDig.logger.info "got 404 for #{@uri}"
  else
- puts "don't know what to do with response: #{doc.status.join(' : ')}"
+ RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
  end
  end
  rescue
- puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
+ RDig.logger.warn "error fetching #{@uri.to_s}: #{$!}"
  ensure
  @content ||= {}
  end
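The Document.create/create_child split above replaces the old referrer-aware Document.create(url, referrer): start URLs still go through Document.create, while links found during the crawl are turned into documents by the page they were found on. A rough sketch of the resulting behaviour (URLs and paths are placeholders):

    doc   = Document.create('http://www.example.com/')  # HttpDocument, depth 0
    child = doc.create_child('/about.html')             # HttpDocument, referrer = doc.uri, depth 1
    doc.create_child('file:///etc/passwd')              # => nil, file:// links found on the web are ignored
    Document.create('file:///var/www/docs/')            # FileDocument; its children stay FileDocuments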
@@ -15,7 +15,7 @@ module RDig
  end

  def add_to_index(document)
- puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
+ RDig.logger.debug "add to index: #{document.uri.to_s}"
  @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
  # all stored and tokenized, should be ferret defaults
  doc = {
@@ -43,7 +43,7 @@ module RDig
  def search(query, options={})
  result = {}
  query = query_parser.parse(query) if query.is_a?(String)
- puts "Query: #{query}"
+ RDig.logger.info "Query: #{query}"
  results = []
  searcher = ferret_searcher
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
@@ -80,6 +80,15 @@ module RDig
  end
  end

+ class DepthFilter
+ def initialize(max_depth = nil)
+ @max_depth = max_depth
+ end
+ def apply(document)
+ return document if @max_depth.nil? || document.depth <= @max_depth
+ end
+ end
+

  # base class for url inclusion / exclusion filters
  class PatternFilter
@@ -98,6 +107,7 @@ module RDig
  end
  end
  end
+
  class UrlExclusionFilter < PatternFilter
  # returns nil if any of the patterns matches it's URI,
  # the document itself otherwise
@@ -176,9 +186,11 @@ module RDig
  p document.uri
  end

+ # filter uris by hostname list. With a nil or empty list all documents may
+ # pass this filter.
  def UrlFilters.hostname_filter(document, include_hosts)
- return document if include_hosts.include?(document.uri.host)
- return nil
+ #RDig.logger.debug "hostname_filter: #{include_hosts}"
+ return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
  end

  def UrlFilters.normalize_uri(document)
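The new DepthFilter ties these pieces together: start URLs get depth 0, every HttpDocument#create_child adds 1, the filter chain instantiates the filter with cfg.crawler.max_depth, and documents deeper than that limit are dropped. Roughly (doc_at_depth_3 stands for any document whose depth is 3):

    RDig::UrlFilters::DepthFilter.new(2).apply(doc_at_depth_3)    # => nil, crawled no further
    RDig::UrlFilters::DepthFilter.new(nil).apply(doc_at_depth_3)  # => the document, nil means unlimited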
data/rakefile CHANGED
@@ -21,7 +21,7 @@ end
  PKG_NAME = 'rdig'

  # Determine the current version of the software
- if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
+ if `ruby -Ilib ./bin/rdig --version` =~ /RDig version ([0-9.]+)$/
  CURRENT_VERSION = $1
  else
  CURRENT_VERSION = "0.0.0"
@@ -131,10 +131,7 @@ else
  #### Dependencies and requirements.

  s.add_dependency('ferret', '>= 0.10.0')
- # TODO: check if there is anything like 'suggested' instead of required, or
- # ORed dependencies...
- #s.add_dependency('rubyful_soup', '>= 1.0.4')
- s.add_dependency('hpricot', '>= 0.4')
+ s.add_dependency('hpricot', '>= 0.6')
  #s.requirements << ""

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -282,9 +279,9 @@ task :prerelease do
  announce "Release Task Testing, skipping checked-in file test"
  else
  announce "Checking for unchecked-in files..."
- data = `svn st`
- unless data =~ /^$/
- fail "SVN status is not clean ... do you have unchecked-in files?"
+ data = `git status`
+ unless data =~ /working directory clean/
+ fail "GIT status is not clean ... do you have unchecked-in files?"
  end
  announce "No outstanding checkins found ... OK"
  end
@@ -310,7 +307,8 @@ task :update_version => [:prerelease] do
  if ENV['RELTEST']
  announce "Release Task Testing, skipping commiting of new version"
  else
- sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+ sh %{git commit -a -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+ sh %{git svn dcommit}
  end
  end
  end
@@ -0,0 +1,17 @@
+ require 'test_helper'
+ class HttpDocumentTest < Test::Unit::TestCase
+ include TestHelper
+
+ def setup
+ @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+ end
+
+ def test_initialize
+ d = Document.create 'http://1stlineleewes.com'
+ assert_equal '1stlineleewes.com', d.uri.host
+ assert_equal '', d.uri.path
+ end
+
+ end
@@ -0,0 +1,38 @@
+ require 'test_helper'
+ class RDigTest < Test::Unit::TestCase
+ include TestHelper
+
+ def setup
+ RDig.configuration do |cfg|
+ @old_crawler_cfg = cfg.crawler.clone
+ cfg.log_level = :debug
+ cfg.log_file = 'tmp/test.log'
+ end
+ end
+
+ def teardown
+ RDig.configuration do |cfg|
+ cfg.crawler = @old_crawler_cfg
+ end
+ end
+
+ def test_proxy_config
+ RDig.configuration do |cfg|
+ cfg.crawler.http_proxy = 'http://proxy.com:8080'
+ end
+ assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+ assert_nil RDig.open_uri_http_options['Authorization']
+ end
+
+ def test_proxy_auth
+ RDig.configuration do |cfg|
+ cfg.crawler.http_proxy = 'http://proxy.com:8080'
+ cfg.crawler.http_proxy_user = 'username'
+ cfg.crawler.http_proxy_pass = 'password'
+ end
+ assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+ assert_equal "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n", RDig.open_uri_http_options['Authorization']
+ end
+ end
@@ -13,6 +13,8 @@ class SearcherTest < Test::Unit::TestCase
  cfg.crawler.wait_before_leave = 1
  cfg.index.path = index_dir
  cfg.verbose = true
+ cfg.log_level = :debug
+ cfg.log_file = 'tmp/test.log'
  end
  crawler = Crawler.new
  crawler.run
@@ -1,5 +1,5 @@
  require 'test_helper'
- class UrlFilterTest < Test::Unit::TestCase
+ class UrlFiltersTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
@@ -73,24 +73,24 @@ class UrlFilterTest < Test::Unit::TestCase
  def test_fix_relative_uri
  doc = Document.create('http://test.host/dir/file.html')
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  assert_equal('http://test.host/dir/../another.html',
- UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('http://test.host/dir/another.html')).uri.to_s)
  assert_equal('HTTP://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('HTTP://test.host/dir/another.html')).uri.to_s)
  doc = Document.create('https://test.host/dir/')
  assert_equal('https://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  doc = Document.create('https://test.host/')
  assert_equal('https://test.host/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  doc = Document.create('https://test.host')
  assert_equal('https://test.host/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  end
  end
metadata CHANGED
@@ -1,74 +1,88 @@
  --- !ruby/object:Gem::Specification
- rubygems_version: 0.8.11
- specification_version: 1
  name: rdig
  version: !ruby/object:Gem::Version
- version: 0.3.4
- date: 2006-12-31 00:00:00 +01:00
- summary: Ruby based web site indexing and searching library.
- require_paths:
- - lib
- email: jk@jkraemer.net
- homepage: http://rdig.rubyforge.org/
- rubyforge_project: rdig
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
- autorequire:
- default_executable: rdig
- bindir: bin
- has_rdoc: true
- required_ruby_version: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">"
- - !ruby/object:Gem::Version
- version: 0.0.0
- version:
+ version: 0.3.5
  platform: ruby
- signing_key:
- cert_chain:
  authors:
  - Jens Kraemer
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-02-26 00:00:00 +01:00
+ default_executable: rdig
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: ferret
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.10.0
+ version:
+ - !ruby/object:Gem::Dependency
+ name: hpricot
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0.6"
+ version:
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+ email: jk@jkraemer.net
+ executables:
+ - rdig
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ - CHANGES
+ - LICENSE
+ - TODO
  files:
  - bin/rdig
- - lib/rdig
- - lib/htmlentities
  - lib/rdig.rb
- - lib/rdig/content_extractors
- - lib/rdig/crawler.rb
- - lib/rdig/search.rb
- - lib/rdig/highlight.rb
- - lib/rdig/index.rb
+ - lib/rdig
  - lib/rdig/url_filters.rb
+ - lib/rdig/index.rb
+ - lib/rdig/crawler.rb
  - lib/rdig/content_extractors.rb
- - lib/rdig/documents.rb
  - lib/rdig/file.rb
- - lib/rdig/content_extractors/rubyful_soup.rb
+ - lib/rdig/highlight.rb
+ - lib/rdig/documents.rb
+ - lib/rdig/search.rb
+ - lib/rdig/content_extractors
  - lib/rdig/content_extractors/doc.rb
  - lib/rdig/content_extractors/hpricot.rb
  - lib/rdig/content_extractors/pdf.rb
- - lib/htmlentities/CHANGES
+ - lib/htmlentities
+ - lib/htmlentities/htmlentities.rb
  - lib/htmlentities/COPYING
+ - lib/htmlentities/CHANGES
  - lib/htmlentities/README
- - lib/htmlentities/htmlentities.rb
- - test/unit
  - test/fixtures
- - test/test_helper.rb
- - test/unit/etag_filter_test.rb
- - test/unit/url_filters_test.rb
- - test/unit/searcher_test.rb
- - test/unit/rubyful_soup_content_extractor_test.rb
- - test/unit/pdf_content_extractor_test.rb
- - test/unit/hpricot_content_extractor_test.rb
- - test/unit/word_content_extractor_test.rb
- - test/unit/file_document_test.rb
- - test/unit/crawler_fs_test.rb
- - test/fixtures/html
- - test/fixtures/pdf
  - test/fixtures/word
- - test/fixtures/html/entities.html
- - test/fixtures/html/simple.html
+ - test/fixtures/word/simple.doc
+ - test/fixtures/html
  - test/fixtures/html/custom_tag_selectors.html
+ - test/fixtures/html/simple.html
+ - test/fixtures/html/entities.html
+ - test/fixtures/pdf
  - test/fixtures/pdf/simple.pdf
- - test/fixtures/word/simple.doc
+ - test/unit
+ - test/unit/crawler_fs_test.rb
+ - test/unit/pdf_content_extractor_test.rb
+ - test/unit/word_content_extractor_test.rb
+ - test/unit/rdig_test.rb
+ - test/unit/http_document_test.rb
+ - test/unit/searcher_test.rb
+ - test/unit/file_document_test.rb
+ - test/unit/url_filters_test.rb
+ - test/unit/hpricot_content_extractor_test.rb
+ - test/unit/etag_filter_test.rb
+ - test/test_helper.rb
  - doc/examples
  - doc/examples/config.rb
  - LICENSE
@@ -77,41 +91,35 @@ files:
  - README
  - install.rb
  - rakefile
- test_files: []
-
+ has_rdoc: true
+ homepage: http://rdig.rubyforge.org/
+ post_install_message:
  rdoc_options:
  - --title
  - Rake -- Ruby Make
  - --main
  - README
  - --line-numbers
- extra_rdoc_files:
- - README
- - CHANGES
- - LICENSE
- - TODO
- executables:
- - rdig
- extensions: []
-
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
  requirements: []

- dependencies:
- - !ruby/object:Gem::Dependency
- name: ferret
- version_requirement:
- version_requirements: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: 0.10.0
- version:
- - !ruby/object:Gem::Dependency
- name: hpricot
- version_requirement:
- version_requirements: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: "0.4"
- version:
+ rubyforge_project: rdig
+ rubygems_version: 1.0.1
+ signing_key:
+ specification_version: 2
+ summary: Ruby based web site indexing and searching library.
+ test_files: []
@@ -1,151 +0,0 @@
- begin
- require 'rubyful_soup'
- rescue LoadError
- require 'rubygems'
- require 'rubyful_soup' rescue nil
- end
-
- if defined?(BeautifulSoup)
-
- # override some methods concered with entity resolving
- # to convert them to strings
- class BeautifulStoneSoup
- # resolve unknown html entities using the htmlentities lib
- alias :orig_unknown_entityref :unknown_entityref
- def unknown_entityref(ref)
- if HTMLEntities::MAP.has_key?(ref)
- handle_data [HTMLEntities::MAP[ref]].pack('U')
- else
- orig_unknown_entityref ref
- end
- end
-
- # resolve numeric entities to utf8
- def handle_charref(ref)
- handle_data( ref.gsub(/([0-9]{1,7})/) {
- [$1.to_i].pack('U')
- }.gsub(/x([0-9a-f]{1,6})/i) {
- [$1.to_i(16)].pack('U')
- } )
- end
- end
-
- module RDig
- module ContentExtractors
-
- # extracts title, content and links from html documents
- class RubyfulSoupContentExtractor < ContentExtractor
-
- def initialize(config)
- super(config.rubyful_soup)
- # if not configured, refuse to handle any content:
- @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
- end
-
- # returns:
- # { :content => 'extracted clear text',
- # :meta => { :title => 'Title' },
- # :links => [array of urls] }
- def process(content)
- result = { }
- tag_soup = BeautifulSoup.new(content)
- result[:title] = extract_title(tag_soup)
- result[:links] = extract_links(tag_soup)
- result[:content] = extract_content(tag_soup)
- return result
- end
-
- # Extracts textual content from the HTML tree.
- #
- # - First, the root element to use is determined using the
- # +content_element+ method, which itself uses the content_tag_selector
- # from RDig.configuration.
- # - Then, this element is processed by +extract_text+, which will give
- # all textual content contained in the root element and all it's
- # children.
- def extract_content(tag_soup)
- content = ''
- ce = content_element(tag_soup)
- ce.children { |child|
- extract_text(child, content)
- } unless ce.nil?
- return content.strip
- end
-
- # extracts the href attributes of all a tags, except
- # internal links like <a href="#top">
- def extract_links(tagsoup)
- tagsoup.find_all('a').map { |link|
- CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
- }.compact
- end
-
- # Extracts the title from the given html tree
- def extract_title(tagsoup)
- the_title_tag = title_tag(tagsoup)
- if the_title_tag.is_a? String
- the_title_tag
- else
- title = ''
- extract_text(the_title_tag, title)
- title.strip
- end
- end
-
- # Recursively extracts all text contained in the given element,
- # and appends it to content.
- def extract_text(element, content='')
- return nil if element.nil?
- if element.is_a? NavigableString
- value = strip_comments(element)
- value.strip!
- unless value.empty?
- content << value
- content << ' '
- end
- elsif element.string # it's a Tag, and it has some content string
- # skip inline scripts and styles
- return nil if element.name =~ /^(script|style)$/i
- value = element.string.strip
- unless value.empty?
- content << value
- content << ' '
- end
- else
- element.children { |child|
- extract_text(child, content)
- }
- end
- end
-
- # Returns the element to extract the title from.
- #
- # This may return a string, e.g. an attribute value selected from a meta
- # tag, too.
- def title_tag(tagsoup)
- if @config.title_tag_selector
- @config.title_tag_selector.call(tagsoup)
- else
- tagsoup.html.head.title
- end
- end
-
- # Retrieve the root element to extract document content from
- def content_element(tagsoup)
- if @config.content_tag_selector
- @config.content_tag_selector.call(tagsoup)
- else
- tagsoup.html.body
- end
- end
-
- # Return the given string minus all html comments
- def strip_comments(string)
- string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
- end
- end
-
- end
- end
-
- end
@@ -1,83 +0,0 @@
- require 'test_helper'
- class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
- include TestHelper
-
- def setup
- @config = OpenStruct.new(
- :content_tag_selector => lambda { |tagsoup|
- tagsoup.html.body
- },
- :title_tag_selector => lambda { |tagsoup|
- tagsoup.html.head.title
- })
- @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
- @nbsp = [160].pack('U') # non breaking space
- end
-
- def test_can_do
- assert !@extractor.can_do('application/pdf')
- assert !@extractor.can_do('application/msword')
- assert @extractor.can_do('text/html')
- assert @extractor.can_do('text/xml')
- assert @extractor.can_do('application/xml')
- assert @extractor.can_do('application/xhtml+xml')
- end
-
- def test_simple
- result = ContentExtractors.process(html_doc('simple'), 'text/html')
- assert_not_nil result
- assert_equal 'Sample Title', result[:title]
- assert_not_nil result[:content]
- assert_not_nil result[:links]
- assert_equal 1, result[:links].size
- assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
- assert_equal 'http://test.host/affe.html', result[:links].first
- end
-
- def test_entities
- result = @extractor.process(html_doc('entities'))
- assert_equal 'Sample & Title', result[:title]
- assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
- assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
- assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
- end
-
- def test_custom_content_element
- @config.title_tag_selector = lambda do |tagsoup|
- tagsoup.find('h1', :attrs => { 'class', 'title' })
- end
- @config.content_tag_selector = lambda do |tagsoup|
- tagsoup.find('div', :attrs => { 'id', 'content' })
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Sample Title in h1', result[:title]
- assert_equal 'Affe Real content is here.', result[:content]
- # check if links are collected outside the content tag, too:
- assert_equal 3, result[:links].size
- assert_equal 'http://test.host/outside.html', result[:links].first
- assert_equal '/inside.html', result[:links][1]
- assert_equal '/footer.html', result[:links][2]
- end
-
-
- def test_title_from_dcmeta
- @config.title_tag_selector = lambda do |tagsoup|
- tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Title from DC meta data', result[:title]
- end
-
- def test_preprocessed_title
- @config.title_tag_selector = lambda do |tagsoup|
- title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
- # use only a portion of the title tag's contents if it matches our
- # regexp:
- title =~ /^(.*)meta data$/ ? $1.strip : title.strip
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Title from DC', result[:title]
- end
-
- end