rdig 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,3 +1,11 @@
+ 0.3.5
+ - Add max_depth option to crawler configuration for limiting the crawl to a
+   specific depth
+ - add support for http proxies including basic authentication
+ - remove rubyfoul_soup support
+
+ 0.3.4
+
  0.3.2
  - make RDig compatible with Ferret 0.10.x
  - won't work any more with Ferret 0.9.x and before
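The options introduced in 0.3.5 above show up in the example configuration and defaults later in this diff; as a quick orientation, here is a minimal sketch of a config using them (URLs and credentials are placeholders):

  RDig.configuration do |cfg|
    cfg.crawler.start_urls = [ 'http://www.example.com/' ]
    # new in 0.3.5: limit how deep the crawler follows links (0 = index only the start_urls)
    cfg.crawler.max_depth = 2
    # new in 0.3.5: fetch through an http proxy, optionally with basic authentication
    cfg.crawler.http_proxy      = 'http://proxy.example.com:8080'
    cfg.crawler.http_proxy_user = 'someuser'
    cfg.crawler.http_proxy_pass = 'secret'
    # logging, also new in this release
    cfg.log_file  = '/tmp/rdig.log'
    cfg.log_level = :info
  end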
@@ -2,6 +2,12 @@ RDig.configuration do |cfg|

  ##################################################################
  # options you really should set
+
+ # log file location
+ cfg.log_file = '/tmp/rdig.log'
+
+ # log level, set to :debug, :info, :warn or :error
+ cfg.log_level = :info

  # provide one or more URLs for the crawler to start from
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
@@ -29,10 +35,11 @@ RDig.configuration do |cfg|
  # content extraction options
  cfg.content_extraction = OpenStruct.new(

- # HPRICOT configuration
- # this is the html parser used by default from RDig 0.3.3 upwards.
- # Hpricot by far outperforms Rubyful Soup, and is at least as flexible when
- # it comes to selection of portions of the html documents.
+ # HPRICOT configuration
+ # hpricot is the html parsing lib used by RDig. See
+ # http://code.whytheluckystiff.net/hpricot for usage information.
+ # Any code blocks given for content selection will receive an Hpricot instance
+ # containing the full page content when called.
  :hpricot => OpenStruct.new(
  # css selector for the element containing the page title
  :title_tag_selector => 'title',
@@ -42,26 +49,6 @@ RDig.configuration do |cfg|
  # might also be a proc returning either an element or a string:
  # :content_tag_selector => lambda { |hpricot_doc| ... }
  )
-
- # RUBYFUL SOUP
- # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
- # RDig's default html parser up to version 0.3.2. To use it, comment the
- # hpricot config above, and uncomment the following:
- #
- # :rubyful_soup => OpenStruct.new(
- # # provide a method that returns the title of an html document
- # # this method may either return a tag to extract the title from,
- # # or a ready-to-index string.
- # :content_tag_selector => lambda { |tagsoup|
- # tagsoup.html.body
- # },
- # # provide a method that selects the tag containing the page content you
- # # want to index. Useful to avoid indexing common elements like navigation
- # # and page footers for every page.
- # :title_tag_selector => lambda { |tagsoup|
- # tagsoup.html.head.title
- # }
- # )
  )

  # crawler options
@@ -95,12 +82,25 @@ RDig.configuration do |cfg|
  # crawls on slow sites. Don't set to 0, even when crawling a local fs.
  # cfg.crawler.wait_before_leave = 10

+ # limit the crawling depth. Default: nil (unlimited)
+ # Set to 0 to only index the start_urls.
+ # cfg.crawler.max_depth = nil
+
+ # http proxy configuration
+ # proxy url
+ # cfg.crawler.http_proxy = nil
+ #
+ # proxy username
+ # cfg.crawler.http_proxy_user = nil
+ # proxy password
+ # cfg.crawler.http_proxy_pass = nil
+
  # indexer options

  # create a new index on each run. Will append to the index if false. Use when
  # building a single index from multiple runs, e.g. one across a website and the
  # other a tree in a local file system
- # config.index.create = true
+ # cfg.index.create = true

  # rewrite document uris before indexing them. This is useful if you're
  # indexing on disk, but the documents should be accessible via http, e.g. from
@@ -24,7 +24,7 @@
  #++
  #

- RDIGVERSION = '0.3.4'
+ RDIGVERSION = '0.3.5'


  require 'thread'
@@ -39,6 +39,8 @@ require 'net/http'
  require 'getoptlong'
  require 'tempfile'
  require 'open-uri'
+ require 'logger'
+ require 'base64'

  begin
  require 'ferret'
@@ -69,10 +71,11 @@ module RDig
  :scheme_filter_http,
  :fix_relative_uri,
  :normalize_uri,
+ { RDig::UrlFilters::DepthFilter => :max_depth },
  { :hostname_filter => :include_hosts },
  { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
  { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
- RDig::UrlFilters::VisitedUrlFilter
+ RDig::UrlFilters::VisitedUrlFilter
  ],
  # filter chain for file system crawling
  :file => [
@@ -103,6 +106,8 @@ module RDig
  yield configuration
  else
  @config ||= OpenStruct.new(
+ :log_file => '/tmp/rdig.log',
+ :log_level => :warn,
  :crawler => OpenStruct.new(
  :start_urls => [ "http://localhost:3000/" ],
  :include_hosts => [ "localhost" ],
@@ -111,7 +116,11 @@ module RDig
  :index_document => nil,
  :num_threads => 2,
  :max_redirects => 5,
- :wait_before_leave => 10
+ :max_depth => nil,
+ :wait_before_leave => 10,
+ :http_proxy => nil,
+ :http_proxy_user => nil,
+ :http_proxy_pass => nil
  ),
  :content_extraction => OpenStruct.new(
  # settings for html content extraction (hpricot)
@@ -124,19 +133,6 @@ module RDig
  # might also be a proc returning either an element or a string:
  # :content_tag_selector => lambda { |hpricot_doc| ... }
  )
- #,
- # # settings for html content extraction (RubyfulSoup)
- # :rubyful_soup => OpenStruct.new(
- # # select the html element that contains the content to index
- # # by default, we index all inside the body tag:
- # :content_tag_selector => lambda { |tagsoup|
- # tagsoup.html.body
- # },
- # # select the html element containing the title
- # :title_tag_selector => lambda { |tagsoup|
- # tagsoup.html.head.title
- # }
- # )
  ),
  :index => OpenStruct.new(
  :path => "index/",
@@ -151,6 +147,36 @@ module RDig
  end
  alias config configuration

+ def logger
+ @logger ||= create_logger
+ end
+
+ def logger=(log)
+ @logger = log
+ end
+
+ def create_logger
+ l = Logger.new(RDig.config.log_file)
+ l.level = Logger.const_get RDig.config.log_level.to_s.upcase rescue Logger::WARN
+ return l
+ end
+
+ # returns http options for open_uri if configured
+ def open_uri_http_options
+ unless RDig::configuration.crawler.open_uri_http_options
+ opts = {}
+ if RDig::configuration.crawler.http_proxy
+ opts[:proxy] = RDig::configuration.crawler.http_proxy
+ if user = RDig::configuration.crawler.http_proxy_user
+ pass = RDig::configuration.crawler.http_proxy_pass
+ opts['Authorization'] = "Basic " + Base64.encode64("#{user}:#{pass}")
+ end
+ end
+ RDig::configuration.crawler.open_uri_http_options = opts
+ end
+ return RDig::configuration.crawler.open_uri_http_options
+ end
+
  end

  class Application
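The open_uri_http_options helper added here feeds its options hash straight into open-uri when pages are fetched; a rough sketch of what it returns for a proxy with basic auth (the encoded header value matches the new tests near the end of this diff):

  require 'base64'
  options = {
    :proxy          => 'http://proxy.com:8080',
    # Base64.encode64 appends a newline, hence the trailing \n expected by the tests
    'Authorization' => "Basic " + Base64.encode64("username:password")   # "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n"
  }
  # HttpDocument#fetch then calls open(url, options) { |doc| ... }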
@@ -210,7 +236,6 @@ module RDig
  when '--query'
  options.query = value
  when '--version'
- puts "rdig, version #{RDIGVERSION}"
  exit
  else
  fail "Unknown option: #{opt}"
@@ -22,7 +22,7 @@ module RDig
  def self.extractors; @@extractors ||= [] end
  def self.extractor_instances
  @@extractor_instances ||= extractors.map { |ex_class|
- puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
+ RDig.logger.info "initializing content extractor: #{ex_class}"
  ex_class.new(RDig.configuration.content_extraction) rescue nil
  }.compact
  end
@@ -77,8 +77,8 @@ end
  # load content extractors
  Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
  begin
- require f
+ require f
  rescue LoadError
- puts "could not load #{f}: #{$!}"
+ RDig::logger.error "could not load #{f}: #{$!}"
  end
  end
@@ -40,12 +40,11 @@ module RDig
  # all textual content contained in the root element and all it's
  # children.
  def extract_content(doc)
- content = ''
- ce = content_element(doc)
- content = strip_tags(strip_comments(ce.inner_html)) if ce
- # (ce/'h1, h2, h3, h4, h5, h6, p, li, dt, dd, td, address, option, ').each do |child|
- # extract_text child, content
- return content.strip
+ if ce = content_element(doc)
+ return strip_tags(strip_comments(ce.inner_html))
+ end
+ # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
+ return ''
  end

  # extracts the href attributes of all a tags, except
@@ -91,7 +90,8 @@ module RDig
  Regexp::MULTILINE, 'u'), ''
  string.gsub! Regexp.new('<.+?>',
  Regexp::MULTILINE, 'u'), ''
- string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+ string.gsub! Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+ string.strip
  end

  end
@@ -3,23 +3,30 @@ module RDig

  class Crawler

- def initialize
+ def initialize(config = RDig.config, logger = RDig.logger)
  @documents = Queue.new
  @etag_filter = ETagFilter.new
+ @logger = logger
+ @config = config
  end

  def run
- raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
- @indexer = Index::Indexer.new(RDig.config.index)
-
+ @indexer = Index::Indexer.new(@config.index)
+ crawl
+ ensure
+ @indexer.close if @indexer
+ end
+
+ def crawl
+ raise 'no start urls given!' if @config.crawler.start_urls.empty?
  # check whether we are indexing on-disk or via http
- url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
+ url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
  chain_config = RDig.filter_chain[url_type]

  filterchain = UrlFilters::FilterChain.new(chain_config)
- RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
-
- num_threads = RDig.config.crawler.num_threads
+ @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
+
+ num_threads = @config.crawler.num_threads
  group = ThreadsWait.new
  num_threads.times { |i|
  group.join_nowait Thread.new("fetcher #{i}") {
@@ -31,20 +38,19 @@ module RDig
  }

  # check for an empty queue every now and then
- sleep_interval = RDig.config.crawler.wait_before_leave
+ sleep_interval = @config.crawler.wait_before_leave
  begin
  sleep sleep_interval
  end until @documents.empty?
  # nothing to do any more, tell the threads to exit
  num_threads.times { @documents << :exit }

- puts "waiting for threads to finish..."
+ @logger.info "waiting for threads to finish..."
  group.all_waits
- ensure
- @indexer.close if @indexer
  end

  def process_document(doc, filterchain)
+ @logger.debug "processing document #{doc}"
  doc.fetch
  # add links from this document to the queue
  doc.content[:links].each { |url|
@@ -52,10 +58,14 @@ module RDig
  } unless doc.content[:links].nil?

  return unless @etag_filter.apply(doc)
- @indexer << doc if doc.needs_indexing?
+ add_to_index doc
  rescue
- puts "error processing document #{doc.uri.to_s}: #{$!}"
- puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
+ @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
+ @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+ end
+
+ def add_to_index(doc)
+ @indexer << doc if doc.needs_indexing?
  end


@@ -64,17 +74,19 @@ module RDig
  # processing
  def add_url(url, filterchain, referring_document = nil)
  return if url.nil? || url.empty?
- if referring_document and referring_document.uri.scheme =~ /^https?/i
- doc = Document.create(url, referring_document.uri)
+
+ @logger.debug "add_url #{url}"
+ doc = if referring_document
+ referring_document.create_child(url)
  else
- doc = Document.create(url)
+ Document.create(url)
  end

  doc = filterchain.apply(doc)

  if doc
  @documents << doc
- puts "added url #{url}" if RDig::config.verbose
+ @logger.debug "url #{url} survived filterchain"
  end
  rescue
  nil
@@ -9,30 +9,23 @@ module RDig
  attr_reader :content
  attr_reader :content_type

- def self.create(url, referrer_uri = nil)
- # a referrer is a clear enough hint to create an HttpDocument
- if referrer_uri && referrer_uri.scheme =~ /^https?$/i
- return HttpDocument.new(:url => url, :referrer => referrer_uri)
- end
-
- case url
- when /^https?:\/\//i
- HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
- when /^file:\/\//i
- # files don't have referrers - the check for nil prevents us from being
- # tricked into indexing local files by file:// links in the web site
- # we index.
- FileDocument.new(:url => url) if referrer_uri.nil?
+ def self.create(url)
+ return case url
+ when /^https?:\/\//i
+ HttpDocument.new(:uri => url)
+ when /^file:\/\//i
+ FileDocument.new(:uri => url)
  end
  end

  # url: url of this document, may be relative to the referring doc or host.
  # referrer: uri of the document we retrieved this link from
  def initialize(args)
+ RDig.logger.debug "initialize: #{args.inspect}"
  begin
- @uri = URI.parse(args[:url])
+ @uri = URI.parse(args[:uri])
  rescue URI::InvalidURIError
- raise "Cannot create document using invalid URL: #{args[:url]}"
+ raise "Cannot create document using invalid URL: #{args[:uri]}"
  end
  end

@@ -48,6 +41,10 @@ module RDig
  !self.content.nil?
  end

+ def to_s
+ "#{self.class.name}, uri=#{uri}, title=#{has_content? ? title : 'not loaded yet'}"
+ end
+
  end


@@ -59,14 +56,17 @@ module RDig
  super(args)
  end

+ def create_child(uri)
+ FileDocument.new(:uri => uri)
+ end
+
  def self.find_files(path)
  links = []
+ pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
  Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+ RDig.logger.debug "checking file #{filename}"
  # Skip files not matching known mime types
- pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
- if File.directory?(filename) || filename =~ pattern
- links << "file://#{filename}"
- end
+ links << "file://#{filename}" if File.directory?(filename) || filename =~ pattern
  end
  links
  end
@@ -97,20 +97,27 @@ module RDig
  #
  class HttpDocument < Document

+ # counts how far this document is away from one of the start urls. Used to limit crawling by depth.
+ attr_reader :depth
  attr_reader :referring_uri
  attr_reader :status
  attr_reader :etag
+
+ def create_child(uri)
+ HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
+ end

  # url: url of this document, may be relative to the referring doc or host.
  # referrer: uri of the document we retrieved this link from
  def initialize(args={})
  super(args)
  @referring_uri = args[:referrer]
+ @depth = args[:depth] || 0
  end

  def fetch
- puts "fetching #{@uri.to_s}" if RDig::config.verbose
- open(@uri.to_s) do |doc|
+ RDig.logger.debug "fetching #{@uri.to_s}"
+ open(@uri.to_s, RDig::open_uri_http_options) do |doc|
  case doc.status.first.to_i
  when 200
  @etag = doc.meta['etag']
@@ -118,13 +125,13 @@
  @content = ContentExtractors.process(doc.read, doc.content_type)
  @status = :success
  when 404
- puts "got 404 for #{@uri}"
+ RDig.logger.info "got 404 for #{@uri}"
  else
- puts "don't know what to do with response: #{doc.status.join(' : ')}"
+ RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
  end
  end
  rescue
- puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
+ RDig.logger.warn "error fetching #{@uri.to_s}: #{$!}"
  ensure
  @content ||= {}
  end
@@ -15,7 +15,7 @@ module RDig
  end

  def add_to_index(document)
- puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
+ RDig.logger.debug "add to index: #{document.uri.to_s}"
  @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
  # all stored and tokenized, should be ferret defaults
  doc = {
@@ -43,7 +43,7 @@
  def search(query, options={})
  result = {}
  query = query_parser.parse(query) if query.is_a?(String)
- puts "Query: #{query}"
+ RDig.logger.info "Query: #{query}"
  results = []
  searcher = ferret_searcher
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
@@ -80,6 +80,15 @@ module RDig
  end
  end

+ class DepthFilter
+ def initialize(max_depth = nil)
+ @max_depth = max_depth
+ end
+ def apply(document)
+ return document if @max_depth.nil? || document.depth <= @max_depth
+ end
+ end
+

  # base class for url inclusion / exclusion filters
  class PatternFilter
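The DepthFilter added above lets a document through only while its depth stays within max_depth (or always, when max_depth is nil); a tiny illustration, using a Struct as a stand-in for a crawled document:

  Doc = Struct.new(:depth)                       # stand-in for an HttpDocument
  filter = RDig::UrlFilters::DepthFilter.new(1)
  filter.apply(Doc.new(0))                       # => document, passes
  filter.apply(Doc.new(1))                       # => document, passes
  filter.apply(Doc.new(2))                       # => nil, link is dropped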
@@ -98,6 +107,7 @@ module RDig
  end
  end
  end
+
  class UrlExclusionFilter < PatternFilter
  # returns nil if any of the patterns matches it's URI,
  # the document itself otherwise
@@ -176,9 +186,11 @@ module RDig
  p document.uri
  end

+ # filter uris by hostname list. With a nil or empty list all documents may
+ # pass this filter.
  def UrlFilters.hostname_filter(document, include_hosts)
- return document if include_hosts.include?(document.uri.host)
- return nil
+ #RDig.logger.debug "hostname_filter: #{include_hosts}"
+ return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
  end

  def UrlFilters.normalize_uri(document)
data/rakefile CHANGED
@@ -21,7 +21,7 @@ end
  PKG_NAME = 'rdig'

  # Determine the current version of the software
- if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
+ if `ruby -Ilib ./bin/rdig --version` =~ /RDig version ([0-9.]+)$/
  CURRENT_VERSION = $1
  else
  CURRENT_VERSION = "0.0.0"
@@ -131,10 +131,7 @@ else
  #### Dependencies and requirements.

  s.add_dependency('ferret', '>= 0.10.0')
- # TODO: check if there is anything like 'suggested' instead of required, or
- # ORed dependencies...
- #s.add_dependency('rubyful_soup', '>= 1.0.4')
- s.add_dependency('hpricot', '>= 0.4')
+ s.add_dependency('hpricot', '>= 0.6')
  #s.requirements << ""

  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -282,9 +279,9 @@ task :prerelease do
  announce "Release Task Testing, skipping checked-in file test"
  else
  announce "Checking for unchecked-in files..."
- data = `svn st`
- unless data =~ /^$/
- fail "SVN status is not clean ... do you have unchecked-in files?"
+ data = `git status`
+ unless data =~ /working directory clean/
+ fail "GIT status is not clean ... do you have unchecked-in files?"
  end
  announce "No outstanding checkins found ... OK"
  end
@@ -310,7 +307,8 @@ task :update_version => [:prerelease] do
  if ENV['RELTEST']
  announce "Release Task Testing, skipping commiting of new version"
  else
- sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+ sh %{git commit -a -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+ sh %{git svn dcommit}
  end
  end
  end
@@ -0,0 +1,17 @@
+ require 'test_helper'
+ class HttpDocumentTest < Test::Unit::TestCase
+ include TestHelper
+
+ def setup
+ @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+ end
+
+ def test_initialize
+ d = Document.create 'http://1stlineleewes.com'
+ assert_equal '1stlineleewes.com', d.uri.host
+ assert_equal '', d.uri.path
+ end
+
+ end
+
+
@@ -0,0 +1,38 @@
+ require 'test_helper'
+ class RDigTest < Test::Unit::TestCase
+ include TestHelper
+
+ def setup
+ RDig.configuration do |cfg|
+ @old_crawler_cfg = cfg.crawler.clone
+ cfg.log_level = :debug
+ cfg.log_file = 'tmp/test.log'
+ end
+ end
+
+ def teardown
+ RDig.configuration do |cfg|
+ cfg.crawler = @old_crawler_cfg
+ end
+ end
+
+ def test_proxy_config
+ RDig.configuration do |cfg|
+ cfg.crawler.http_proxy = 'http://proxy.com:8080'
+ end
+ assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+ assert_nil RDig.open_uri_http_options['Authorization']
+ end
+
+ def test_proxy_auth
+ RDig.configuration do |cfg|
+ cfg.crawler.http_proxy = 'http://proxy.com:8080'
+ cfg.crawler.http_proxy_user = 'username'
+ cfg.crawler.http_proxy_pass = 'password'
+ end
+ assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+ assert_equal "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n", RDig.open_uri_http_options['Authorization']
+ end
+ end
+
+
@@ -13,6 +13,8 @@ class SearcherTest < Test::Unit::TestCase
  cfg.crawler.wait_before_leave = 1
  cfg.index.path = index_dir
  cfg.verbose = true
+ cfg.log_level = :debug
+ cfg.log_file = 'tmp/test.log'
  end
  crawler = Crawler.new
  crawler.run
@@ -1,5 +1,5 @@
  require 'test_helper'
- class UrlFilterTest < Test::Unit::TestCase
+ class UrlFiltersTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
@@ -73,24 +73,24 @@ class UrlFilterTest < Test::Unit::TestCase
  def test_fix_relative_uri
  doc = Document.create('http://test.host/dir/file.html')
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  assert_equal('http://test.host/dir/../another.html',
- UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
  assert_equal('http://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('http://test.host/dir/another.html')).uri.to_s)
  assert_equal('HTTP://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('HTTP://test.host/dir/another.html')).uri.to_s)
  doc = Document.create('https://test.host/dir/')
  assert_equal('https://test.host/dir/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  doc = Document.create('https://test.host/')
  assert_equal('https://test.host/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  doc = Document.create('https://test.host')
  assert_equal('https://test.host/another.html',
- UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
+ UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
  end
  end

metadata CHANGED
@@ -1,74 +1,88 @@
  --- !ruby/object:Gem::Specification
- rubygems_version: 0.8.11
- specification_version: 1
  name: rdig
  version: !ruby/object:Gem::Version
- version: 0.3.4
- date: 2006-12-31 00:00:00 +01:00
- summary: Ruby based web site indexing and searching library.
- require_paths:
- - lib
- email: jk@jkraemer.net
- homepage: http://rdig.rubyforge.org/
- rubyforge_project: rdig
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
- autorequire:
- default_executable: rdig
- bindir: bin
- has_rdoc: true
- required_ruby_version: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">"
- - !ruby/object:Gem::Version
- version: 0.0.0
- version:
+ version: 0.3.5
  platform: ruby
- signing_key:
- cert_chain:
  authors:
  - Jens Kraemer
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-02-26 00:00:00 +01:00
+ default_executable: rdig
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: ferret
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 0.10.0
+ version:
+ - !ruby/object:Gem::Dependency
+ name: hpricot
+ version_requirement:
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0.6"
+ version:
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+ email: jk@jkraemer.net
+ executables:
+ - rdig
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ - CHANGES
+ - LICENSE
+ - TODO
  files:
  - bin/rdig
- - lib/rdig
- - lib/htmlentities
  - lib/rdig.rb
- - lib/rdig/content_extractors
- - lib/rdig/crawler.rb
- - lib/rdig/search.rb
- - lib/rdig/highlight.rb
- - lib/rdig/index.rb
+ - lib/rdig
  - lib/rdig/url_filters.rb
+ - lib/rdig/index.rb
+ - lib/rdig/crawler.rb
  - lib/rdig/content_extractors.rb
- - lib/rdig/documents.rb
  - lib/rdig/file.rb
- - lib/rdig/content_extractors/rubyful_soup.rb
+ - lib/rdig/highlight.rb
+ - lib/rdig/documents.rb
+ - lib/rdig/search.rb
+ - lib/rdig/content_extractors
  - lib/rdig/content_extractors/doc.rb
  - lib/rdig/content_extractors/hpricot.rb
  - lib/rdig/content_extractors/pdf.rb
- - lib/htmlentities/CHANGES
+ - lib/htmlentities
+ - lib/htmlentities/htmlentities.rb
  - lib/htmlentities/COPYING
+ - lib/htmlentities/CHANGES
  - lib/htmlentities/README
- - lib/htmlentities/htmlentities.rb
- - test/unit
  - test/fixtures
- - test/test_helper.rb
- - test/unit/etag_filter_test.rb
- - test/unit/url_filters_test.rb
- - test/unit/searcher_test.rb
- - test/unit/rubyful_soup_content_extractor_test.rb
- - test/unit/pdf_content_extractor_test.rb
- - test/unit/hpricot_content_extractor_test.rb
- - test/unit/word_content_extractor_test.rb
- - test/unit/file_document_test.rb
- - test/unit/crawler_fs_test.rb
- - test/fixtures/html
- - test/fixtures/pdf
  - test/fixtures/word
- - test/fixtures/html/entities.html
- - test/fixtures/html/simple.html
+ - test/fixtures/word/simple.doc
+ - test/fixtures/html
  - test/fixtures/html/custom_tag_selectors.html
+ - test/fixtures/html/simple.html
+ - test/fixtures/html/entities.html
+ - test/fixtures/pdf
  - test/fixtures/pdf/simple.pdf
- - test/fixtures/word/simple.doc
+ - test/unit
+ - test/unit/crawler_fs_test.rb
+ - test/unit/pdf_content_extractor_test.rb
+ - test/unit/word_content_extractor_test.rb
+ - test/unit/rdig_test.rb
+ - test/unit/http_document_test.rb
+ - test/unit/searcher_test.rb
+ - test/unit/file_document_test.rb
+ - test/unit/url_filters_test.rb
+ - test/unit/hpricot_content_extractor_test.rb
+ - test/unit/etag_filter_test.rb
+ - test/test_helper.rb
  - doc/examples
  - doc/examples/config.rb
  - LICENSE
@@ -77,41 +91,35 @@ files:
  - README
  - install.rb
  - rakefile
- test_files: []
-
+ has_rdoc: true
+ homepage: http://rdig.rubyforge.org/
+ post_install_message:
  rdoc_options:
  - --title
  - Rake -- Ruby Make
  - --main
  - README
  - --line-numbers
- extra_rdoc_files:
- - README
- - CHANGES
- - LICENSE
- - TODO
- executables:
- - rdig
- extensions: []
-
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: "0"
+ version:
  requirements: []

- dependencies:
- - !ruby/object:Gem::Dependency
- name: ferret
- version_requirement:
- version_requirements: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: 0.10.0
- version:
- - !ruby/object:Gem::Dependency
- name: hpricot
- version_requirement:
- version_requirements: !ruby/object:Gem::Version::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: "0.4"
- version:
+ rubyforge_project: rdig
+ rubygems_version: 1.0.1
+ signing_key:
+ specification_version: 2
+ summary: Ruby based web site indexing and searching library.
+ test_files: []
+
@@ -1,151 +0,0 @@
- begin
- require 'rubyful_soup'
- rescue LoadError
- require 'rubygems'
- require 'rubyful_soup' rescue nil
- end
-
- if defined?(BeautifulSoup)
-
- # override some methods concered with entity resolving
- # to convert them to strings
- class BeautifulStoneSoup
- # resolve unknown html entities using the htmlentities lib
- alias :orig_unknown_entityref :unknown_entityref
- def unknown_entityref(ref)
- if HTMLEntities::MAP.has_key?(ref)
- handle_data [HTMLEntities::MAP[ref]].pack('U')
- else
- orig_unknown_entityref ref
- end
- end
-
- # resolve numeric entities to utf8
- def handle_charref(ref)
- handle_data( ref.gsub(/([0-9]{1,7})/) {
- [$1.to_i].pack('U')
- }.gsub(/x([0-9a-f]{1,6})/i) {
- [$1.to_i(16)].pack('U')
- } )
- end
- end
-
- module RDig
- module ContentExtractors
-
- # extracts title, content and links from html documents
- class RubyfulSoupContentExtractor < ContentExtractor
-
- def initialize(config)
- super(config.rubyful_soup)
- # if not configured, refuse to handle any content:
- @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
- end
-
- # returns:
- # { :content => 'extracted clear text',
- # :meta => { :title => 'Title' },
- # :links => [array of urls] }
- def process(content)
- result = { }
- tag_soup = BeautifulSoup.new(content)
- result[:title] = extract_title(tag_soup)
- result[:links] = extract_links(tag_soup)
- result[:content] = extract_content(tag_soup)
- return result
- end
-
- # Extracts textual content from the HTML tree.
- #
- # - First, the root element to use is determined using the
- # +content_element+ method, which itself uses the content_tag_selector
- # from RDig.configuration.
- # - Then, this element is processed by +extract_text+, which will give
- # all textual content contained in the root element and all it's
- # children.
- def extract_content(tag_soup)
- content = ''
- ce = content_element(tag_soup)
- ce.children { |child|
- extract_text(child, content)
- } unless ce.nil?
- return content.strip
- end
-
- # extracts the href attributes of all a tags, except
- # internal links like <a href="#top">
- def extract_links(tagsoup)
- tagsoup.find_all('a').map { |link|
- CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
- }.compact
- end
-
- # Extracts the title from the given html tree
- def extract_title(tagsoup)
- the_title_tag = title_tag(tagsoup)
- if the_title_tag.is_a? String
- the_title_tag
- else
- title = ''
- extract_text(the_title_tag, title)
- title.strip
- end
- end
-
- # Recursively extracts all text contained in the given element,
- # and appends it to content.
- def extract_text(element, content='')
- return nil if element.nil?
- if element.is_a? NavigableString
- value = strip_comments(element)
- value.strip!
- unless value.empty?
- content << value
- content << ' '
- end
- elsif element.string # it's a Tag, and it has some content string
- # skip inline scripts and styles
- return nil if element.name =~ /^(script|style)$/i
- value = element.string.strip
- unless value.empty?
- content << value
- content << ' '
- end
- else
- element.children { |child|
- extract_text(child, content)
- }
- end
- end
-
- # Returns the element to extract the title from.
- #
- # This may return a string, e.g. an attribute value selected from a meta
- # tag, too.
- def title_tag(tagsoup)
- if @config.title_tag_selector
- @config.title_tag_selector.call(tagsoup)
- else
- tagsoup.html.head.title
- end
- end
-
- # Retrieve the root element to extract document content from
- def content_element(tagsoup)
- if @config.content_tag_selector
- @config.content_tag_selector.call(tagsoup)
- else
- tagsoup.html.body
- end
- end
-
- # Return the given string minus all html comments
- def strip_comments(string)
- string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
- end
- end
- end
-
- end
- end
-
- end
@@ -1,83 +0,0 @@
- require 'test_helper'
- class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
- include TestHelper
-
- def setup
- @config = OpenStruct.new(
- :content_tag_selector => lambda { |tagsoup|
- tagsoup.html.body
- },
- :title_tag_selector => lambda { |tagsoup|
- tagsoup.html.head.title
- })
- @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
- @nbsp = [160].pack('U') # non breaking space
- end
-
- def test_can_do
- assert !@extractor.can_do('application/pdf')
- assert !@extractor.can_do('application/msword')
- assert @extractor.can_do('text/html')
- assert @extractor.can_do('text/xml')
- assert @extractor.can_do('application/xml')
- assert @extractor.can_do('application/xhtml+xml')
- end
-
- def test_simple
- result = ContentExtractors.process(html_doc('simple'), 'text/html')
- assert_not_nil result
- assert_equal 'Sample Title', result[:title]
- assert_not_nil result[:content]
- assert_not_nil result[:links]
- assert_equal 1, result[:links].size
- assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
- assert_equal 'http://test.host/affe.html', result[:links].first
- end
-
- def test_entities
- result = @extractor.process(html_doc('entities'))
- assert_equal 'Sample & Title', result[:title]
- assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
- assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
- assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
- end
-
- def test_custom_content_element
- @config.title_tag_selector = lambda do |tagsoup|
- tagsoup.find('h1', :attrs => { 'class', 'title' })
- end
- @config.content_tag_selector = lambda do |tagsoup|
- tagsoup.find('div', :attrs => { 'id', 'content' })
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Sample Title in h1', result[:title]
- assert_equal 'Affe Real content is here.', result[:content]
- # check if links are collected outside the content tag, too:
- assert_equal 3, result[:links].size
- assert_equal 'http://test.host/outside.html', result[:links].first
- assert_equal '/inside.html', result[:links][1]
- assert_equal '/footer.html', result[:links][2]
- end
-
-
- def test_title_from_dcmeta
- @config.title_tag_selector = lambda do |tagsoup|
- tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Title from DC meta data', result[:title]
- end
-
- def test_preprocessed_title
- @config.title_tag_selector = lambda do |tagsoup|
- title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
- # use only a portion of the title tag's contents if it matches our
- # regexp:
- title =~ /^(.*)meta data$/ ? $1.strip : title.strip
- end
- result = @extractor.process(html_doc('custom_tag_selectors'))
- assert_equal 'Title from DC', result[:title]
- end
-
- end
-