rdig 0.3.4 → 0.3.5
- data/CHANGES +8 -0
- data/doc/examples/config.rb +25 -25
- data/lib/rdig.rb +42 -17
- data/lib/rdig/content_extractors.rb +3 -3
- data/lib/rdig/content_extractors/hpricot.rb +7 -7
- data/lib/rdig/crawler.rb +31 -19
- data/lib/rdig/documents.rb +32 -25
- data/lib/rdig/index.rb +1 -1
- data/lib/rdig/search.rb +1 -1
- data/lib/rdig/url_filters.rb +14 -2
- data/rakefile +7 -9
- data/test/unit/http_document_test.rb +17 -0
- data/test/unit/rdig_test.rb +38 -0
- data/test/unit/searcher_test.rb +2 -0
- data/test/unit/url_filters_test.rb +9 -9
- metadata +88 -80
- data/lib/rdig/content_extractors/rubyful_soup.rb +0 -151
- data/test/unit/rubyful_soup_content_extractor_test.rb +0 -83
data/CHANGES
CHANGED
@@ -1,3 +1,11 @@
+0.3.5
+- Add max_depth option to crawler configuration for limiting the crawl to a
+  specific depth
+- add support for http proxies including basic authentication
+- remove rubyfoul_soup support
+
+0.3.4
+
 0.3.2
 - make RDig compatible with Ferret 0.10.x
 - won't work any more with Ferret 0.9.x and before
data/doc/examples/config.rb
CHANGED
@@ -2,6 +2,12 @@ RDig.configuration do |cfg|
 
   ##################################################################
   # options you really should set
+
+  # log file location
+  cfg.log_file = '/tmp/rdig.log'
+
+  # log level, set to :debug, :info, :warn or :error
+  cfg.log_level = :info
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]
@@ -29,10 +35,11 @@ RDig.configuration do |cfg|
   # content extraction options
   cfg.content_extraction = OpenStruct.new(
 
-
-
-
-
+    # HPRICOT configuration
+    # hpricot is the html parsing lib used by RDig. See
+    # http://code.whytheluckystiff.net/hpricot for usage information.
+    # Any code blocks given for content selection will receive an Hpricot instance
+    # containing the full page content when called.
     :hpricot => OpenStruct.new(
       # css selector for the element containing the page title
      :title_tag_selector => 'title',
@@ -42,26 +49,6 @@ RDig.configuration do |cfg|
       # might also be a proc returning either an element or a string:
       # :content_tag_selector => lambda { |hpricot_doc| ... }
     )
-
-    # RUBYFUL SOUP
-    # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
-    # RDig's default html parser up to version 0.3.2. To use it, comment the
-    # hpricot config above, and uncomment the following:
-    #
-    # :rubyful_soup => OpenStruct.new(
-    #   # provide a method that returns the title of an html document
-    #   # this method may either return a tag to extract the title from,
-    #   # or a ready-to-index string.
-    #   :content_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.body
-    #   },
-    #   # provide a method that selects the tag containing the page content you
-    #   # want to index. Useful to avoid indexing common elements like navigation
-    #   # and page footers for every page.
-    #   :title_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.head.title
-    #   }
-    # )
   )
 
   # crawler options
@@ -95,12 +82,25 @@ RDig.configuration do |cfg|
   # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
 
+  # limit the crawling depth. Default: nil (unlimited)
+  # Set to 0 to only index the start_urls.
+  # cfg.crawler.max_depth = nil
+
+  # http proxy configuration
+  # proxy url
+  # cfg.crawler.http_proxy = nil
+  #
+  # proxy username
+  # cfg.crawler.http_proxy_user = nil
+  # proxy password
+  # cfg.crawler.http_proxy_pass = nil
+
   # indexer options
 
   # create a new index on each run. Will append to the index if false. Use when
   # building a single index from multiple runs, e.g. one across a website and the
   # other a tree in a local file system
-  #
+  # cfg.index.create = true
 
   # rewrite document uris before indexing them. This is useful if you're
   # indexing on disk, but the documents should be accessible via http, e.g. from
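Pulled out of the diff context, the options introduced in this example config amount to the following minimal, uncommented setup (a sketch only; the start url, proxy host and credentials are placeholder values):

  require 'rdig'

  RDig.configuration do |cfg|
    cfg.log_file  = '/tmp/rdig.log'
    cfg.log_level = :debug

    cfg.crawler.start_urls = [ 'http://www.example.com/' ]

    # follow links at most two hops away from a start url
    cfg.crawler.max_depth = 2

    # route all requests through an authenticating http proxy
    cfg.crawler.http_proxy      = 'http://proxy.example.com:8080'
    cfg.crawler.http_proxy_user = 'username'
    cfg.crawler.http_proxy_pass = 'password'
  end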
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.3.4'
+RDIGVERSION = '0.3.5'
 
 
 require 'thread'
@@ -39,6 +39,8 @@ require 'net/http'
 require 'getoptlong'
 require 'tempfile'
 require 'open-uri'
+require 'logger'
+require 'base64'
 
 begin
   require 'ferret'
@@ -69,10 +71,11 @@ module RDig
       :scheme_filter_http,
       :fix_relative_uri,
       :normalize_uri,
+      { RDig::UrlFilters::DepthFilter => :max_depth },
       { :hostname_filter => :include_hosts },
       { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
       { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
-      RDig::UrlFilters::VisitedUrlFilter
+      RDig::UrlFilters::VisitedUrlFilter
     ],
     # filter chain for file system crawling
     :file => [
@@ -103,6 +106,8 @@ module RDig
       yield configuration
     else
       @config ||= OpenStruct.new(
+        :log_file => '/tmp/rdig.log',
+        :log_level => :warn,
         :crawler => OpenStruct.new(
           :start_urls => [ "http://localhost:3000/" ],
           :include_hosts => [ "localhost" ],
@@ -111,7 +116,11 @@ module RDig
           :index_document => nil,
           :num_threads => 2,
           :max_redirects => 5,
-          :wait_before_leave => 10
+          :max_depth => nil,
+          :wait_before_leave => 10,
+          :http_proxy => nil,
+          :http_proxy_user => nil,
+          :http_proxy_pass => nil
         ),
         :content_extraction => OpenStruct.new(
           # settings for html content extraction (hpricot)
@@ -124,19 +133,6 @@ module RDig
           # might also be a proc returning either an element or a string:
           # :content_tag_selector => lambda { |hpricot_doc| ... }
         )
-        #,
-        # # settings for html content extraction (RubyfulSoup)
-        # :rubyful_soup => OpenStruct.new(
-        #   # select the html element that contains the content to index
-        #   # by default, we index all inside the body tag:
-        #   :content_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.body
-        #   },
-        #   # select the html element containing the title
-        #   :title_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.head.title
-        #   }
-        # )
       ),
       :index => OpenStruct.new(
         :path => "index/",
@@ -151,6 +147,36 @@ module RDig
     end
     alias config configuration
 
+    def logger
+      @logger ||= create_logger
+    end
+
+    def logger=(log)
+      @logger = log
+    end
+
+    def create_logger
+      l = Logger.new(RDig.config.log_file)
+      l.level = Logger.const_get RDig.config.log_level.to_s.upcase rescue Logger::WARN
+      return l
+    end
+
+    # returns http options for open_uri if configured
+    def open_uri_http_options
+      unless RDig::configuration.crawler.open_uri_http_options
+        opts = {}
+        if RDig::configuration.crawler.http_proxy
+          opts[:proxy] = RDig::configuration.crawler.http_proxy
+          if user = RDig::configuration.crawler.http_proxy_user
+            pass = RDig::configuration.crawler.http_proxy_pass
+            opts['Authorization'] = "Basic " + Base64.encode64("#{user}:#{pass}")
+          end
+        end
+        RDig::configuration.crawler.open_uri_http_options = opts
+      end
+      return RDig::configuration.crawler.open_uri_http_options
+    end
+
   end
 
   class Application
@@ -210,7 +236,6 @@ module RDig
         when '--query'
           options.query = value
         when '--version'
-          puts "rdig, version #{RDIGVERSION}"
           exit
         else
           fail "Unknown option: #{opt}"
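The new RDig.open_uri_http_options helper builds the option hash handed to open-uri when fetching documents, memoizing it on the crawler configuration. Note that Base64.encode64 appends a trailing newline to its output, which is why the proxy Authorization header ends in \n (the new test in data/test/unit/rdig_test.rb below asserts exactly that). A quick check with placeholder credentials:

  require 'base64'

  Base64.encode64("username:password")
  # => "dXNlcm5hbWU6cGFzc3dvcmQ=\n"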
data/lib/rdig/content_extractors.rb
CHANGED
@@ -22,7 +22,7 @@
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
       @@extractor_instances ||= extractors.map { |ex_class|
-
+        RDig.logger.info "initializing content extractor: #{ex_class}"
         ex_class.new(RDig.configuration.content_extraction) rescue nil
       }.compact
     end
@@ -77,8 +77,8 @@ end
 # load content extractors
 Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
   begin
-    require f
+    require f
   rescue LoadError
-
+    RDig::logger.error "could not load #{f}: #{$!}"
   end
 end
data/lib/rdig/content_extractors/hpricot.rb
CHANGED
@@ -40,12 +40,11 @@ module RDig
     # all textual content contained in the root element and all it's
     # children.
     def extract_content(doc)
-
-
-
-      #
-
-      return content.strip
+      if ce = content_element(doc)
+        return strip_tags(strip_comments(ce.inner_html))
+      end
+      # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
+      return ''
     end
 
     # extracts the href attributes of all a tags, except
@@ -91,7 +90,8 @@ module RDig
                   Regexp::MULTILINE, 'u'), ''
       string.gsub! Regexp.new('<.+?>',
                   Regexp::MULTILINE, 'u'), ''
-      string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.gsub! Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.strip
     end
 
   end
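One detail in the strip_tags change above: the old version ended with a non-mutating gsub whose return value doubled as the method's result, while the new version switches to gsub! and therefore needs the explicit string.strip as the final expression, because String#gsub! returns nil when no substitution took place and so must not be the last line of the method. A minimal illustration:

  s = 'nowhitespace'
  s.gsub!(/\s+/, ' ')  # => nil (nothing matched; gsub! signals this by returning nil)
  s.gsub(/\s+/, ' ')   # => "nowhitespace" (gsub always returns a string)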
data/lib/rdig/crawler.rb
CHANGED
@@ -3,23 +3,30 @@ module RDig
 
   class Crawler
 
-    def initialize
+    def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
       @etag_filter = ETagFilter.new
+      @logger = logger
+      @config = config
     end
 
     def run
-
-
-
+      @indexer = Index::Indexer.new(@config.index)
+      crawl
+    ensure
+      @indexer.close if @indexer
+    end
+
+    def crawl
+      raise 'no start urls given!' if @config.crawler.start_urls.empty?
       # check whether we are indexing on-disk or via http
-      url_type =
+      url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
 
       filterchain = UrlFilters::FilterChain.new(chain_config)
-
-
-      num_threads =
+      @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
+
+      num_threads = @config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
@@ -31,20 +38,19 @@ module RDig
       }
 
       # check for an empty queue every now and then
-      sleep_interval =
+      sleep_interval = @config.crawler.wait_before_leave
       begin
         sleep sleep_interval
       end until @documents.empty?
       # nothing to do any more, tell the threads to exit
       num_threads.times { @documents << :exit }
 
-
+      @logger.info "waiting for threads to finish..."
       group.all_waits
-    ensure
-      @indexer.close if @indexer
     end
 
     def process_document(doc, filterchain)
+      @logger.debug "processing document #{doc}"
       doc.fetch
       # add links from this document to the queue
       doc.content[:links].each { |url|
@@ -52,10 +58,14 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
+      add_to_index doc
     rescue
-
-
+      @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
+      @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+    end
+
+    def add_to_index(doc)
+      @indexer << doc if doc.needs_indexing?
     end
 
 
@@ -64,17 +74,19 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-
-
+
+      @logger.debug "add_url #{url}"
+      doc = if referring_document
+        referring_document.create_child(url)
       else
-
+        Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
         @documents << doc
-
+        @logger.debug "url #{url} survived filterchain"
       end
     rescue
       nil
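Since the crawler now receives its configuration and logger as constructor arguments (defaulting to RDig.config and RDig.logger) instead of reaching for the globals directly, it can be run with an injected setup, e.g. for testing. A sketch, assuming configuration has already been set up as in the example config above:

  require 'rdig'
  require 'logger'

  # use the global defaults ...
  crawler = RDig::Crawler.new

  # ... or inject your own, e.g. to trace a crawl on stdout
  crawler = RDig::Crawler.new(RDig.config, Logger.new($stdout))
  crawler.run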
data/lib/rdig/documents.rb
CHANGED
@@ -9,30 +9,23 @@ module RDig
     attr_reader :content
     attr_reader :content_type
 
-    def self.create(url, referrer_uri = nil)
-
-
-
-
-
-      case url
-      when /^https?:\/\//i
-        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
-      when /^file:\/\//i
-        # files don't have referrers - the check for nil prevents us from being
-        # tricked into indexing local files by file:// links in the web site
-        # we index.
-        FileDocument.new(:url => url) if referrer_uri.nil?
+    def self.create(url)
+      return case url
+      when /^https?:\/\//i
+        HttpDocument.new(:uri => url)
+      when /^file:\/\//i
+        FileDocument.new(:uri => url)
       end
     end
 
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args)
+      RDig.logger.debug "initialize: #{args.inspect}"
       begin
-        @uri = URI.parse(args[:url])
+        @uri = URI.parse(args[:uri])
       rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{args[:url]}"
+        raise "Cannot create document using invalid URL: #{args[:uri]}"
       end
     end
 
@@ -48,6 +41,10 @@ module RDig
       !self.content.nil?
     end
 
+    def to_s
+      "#{self.class.name}, uri=#{uri}, title=#{has_content? ? title : 'not loaded yet'}"
+    end
+
   end
 
 
@@ -59,14 +56,17 @@ module RDig
       super(args)
     end
 
+    def create_child(uri)
+      FileDocument.new(:uri => uri)
+    end
+
     def self.find_files(path)
       links = []
+      pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
       Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        RDig.logger.debug "checking file #{filename}"
         # Skip files not matching known mime types
-
-        if File.directory?(filename) || filename =~ pattern
-          links << "file://#{filename}"
-        end
+        links << "file://#{filename}" if File.directory?(filename) || filename =~ pattern
       end
       links
     end
@@ -97,20 +97,27 @@ module RDig
     #
     class HttpDocument < Document
 
+      # counts how far this document is away from one of the start urls. Used to limit crawling by depth.
+      attr_reader :depth
       attr_reader :referring_uri
       attr_reader :status
       attr_reader :etag
+
+      def create_child(uri)
+        HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
+      end
 
       # url: url of this document, may be relative to the referring doc or host.
       # referrer: uri of the document we retrieved this link from
       def initialize(args={})
        super(args)
        @referring_uri = args[:referrer]
+        @depth = args[:depth] || 0
      end
 
      def fetch
-
-        open(@uri.to_s) do |doc|
+        RDig.logger.debug "fetching #{@uri.to_s}"
+        open(@uri.to_s, RDig::open_uri_http_options) do |doc|
          case doc.status.first.to_i
          when 200
            @etag = doc.meta['etag']
@@ -118,13 +125,13 @@ module RDig
           @content = ContentExtractors.process(doc.read, doc.content_type)
           @status = :success
         when 404
-
+          RDig.logger.info "got 404 for #{@uri}"
         else
-
+          RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
         end
       end
     rescue
-
+      RDig.logger.warn "error fetching #{@uri.to_s}: #{$!}"
     ensure
       @content ||= {}
     end
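The factory logic is now split in two: start urls go through Document.create, while links found during a crawl go through the referring document's create_child, which tracks the new depth counter and refuses to create file:// children of http documents (replacing the old referrer_uri check). A sketch of the resulting behaviour, using a placeholder host:

  root = RDig::Document.create('http://www.example.com/')     # HttpDocument, depth 0
  child = root.create_child('http://www.example.com/a.html')
  child.depth           # => 1
  child.referring_uri   # => URI for http://www.example.com/

  # file:// links found in web content are not followed:
  root.create_child('file:///tmp/local.txt')   # => nil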
data/lib/rdig/index.rb
CHANGED
@@ -15,7 +15,7 @@ module RDig
     end
 
     def add_to_index(document)
-
+      RDig.logger.debug "add to index: #{document.uri.to_s}"
       @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
       # all stored and tokenized, should be ferret defaults
       doc = {
data/lib/rdig/search.rb
CHANGED
@@ -43,7 +43,7 @@ module RDig
     def search(query, options={})
       result = {}
       query = query_parser.parse(query) if query.is_a?(String)
-
+      RDig.logger.info "Query: #{query}"
       results = []
       searcher = ferret_searcher
       result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
data/lib/rdig/url_filters.rb
CHANGED
@@ -80,6 +80,15 @@ module RDig
       end
     end
 
+    class DepthFilter
+      def initialize(max_depth = nil)
+        @max_depth = max_depth
+      end
+      def apply(document)
+        return document if @max_depth.nil? || document.depth <= @max_depth
+      end
+    end
+
 
     # base class for url inclusion / exclusion filters
     class PatternFilter
@@ -98,6 +107,7 @@ module RDig
       end
     end
   end
+
   class UrlExclusionFilter < PatternFilter
     # returns nil if any of the patterns matches it's URI,
     # the document itself otherwise
@@ -176,9 +186,11 @@ module RDig
       p document.uri
     end
 
+    # filter uris by hostname list. With a nil or empty list all documents may
+    # pass this filter.
     def UrlFilters.hostname_filter(document, include_hosts)
-
-      return nil
+      #RDig.logger.debug "hostname_filter: #{include_hosts}"
+      return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
     end
 
     def UrlFilters.normalize_uri(document)
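DepthFilter follows the same contract as the other filters in the chain: apply returns the document to keep it and nil to drop it. Through the { RDig::UrlFilters::DepthFilter => :max_depth } entry added to the filter chain in rdig.rb, its constructor receives the configured cfg.crawler.max_depth. Standalone, the behaviour looks roughly like this (a sketch with placeholder urls, relying on the HttpDocument depth counter shown above):

  filter = RDig::UrlFilters::DepthFilter.new(2)

  doc = RDig::Document.create('http://www.example.com/')      # depth 0
  filter.apply(doc)                                           # => doc, kept

  deep = doc.create_child('http://www.example.com/a')         # depth 1
  deep = deep.create_child('http://www.example.com/a/b')      # depth 2
  deep = deep.create_child('http://www.example.com/a/b/c')    # depth 3
  filter.apply(deep)                                          # => nil, dropped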
data/rakefile
CHANGED
@@ -21,7 +21,7 @@ end
 PKG_NAME = 'rdig'
 
 # Determine the current version of the software
-if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
+if `ruby -Ilib ./bin/rdig --version` =~ /RDig version ([0-9.]+)$/
   CURRENT_VERSION = $1
 else
   CURRENT_VERSION = "0.0.0"
@@ -131,10 +131,7 @@ else
   #### Dependencies and requirements.
 
   s.add_dependency('ferret', '>= 0.10.0')
-
-  # ORed dependencies...
-  #s.add_dependency('rubyful_soup', '>= 1.0.4')
-  s.add_dependency('hpricot', '>= 0.4')
+  s.add_dependency('hpricot', '>= 0.6')
   #s.requirements << ""
 
   #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -282,9 +279,9 @@ task :prerelease do
     announce "Release Task Testing, skipping checked-in file test"
   else
     announce "Checking for unchecked-in files..."
-    data = `
-    unless data =~
-      fail "
+    data = `git status`
+    unless data =~ /working directory clean/
+      fail "GIT status is not clean ... do you have unchecked-in files?"
     end
     announce "No outstanding checkins found ... OK"
   end
@@ -310,7 +307,8 @@ task :update_version => [:prerelease] do
   if ENV['RELTEST']
     announce "Release Task Testing, skipping commiting of new version"
   else
-    sh %{
+    sh %{git commit -a -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+    sh %{git svn dcommit}
   end
 end
data/test/unit/http_document_test.rb
ADDED
@@ -0,0 +1,17 @@
+require 'test_helper'
+class HttpDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_initialize
+    d = Document.create 'http://1stlineleewes.com'
+    assert_equal '1stlineleewes.com', d.uri.host
+    assert_equal '', d.uri.path
+  end
+
+end
+
+
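The empty-path assertion holds because Ruby's URI.parse leaves the path of a url without a trailing slash empty instead of defaulting it to '/':

  require 'uri'

  URI.parse('http://1stlineleewes.com').path    # => ""
  URI.parse('http://1stlineleewes.com/').path   # => "/"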
data/test/unit/rdig_test.rb
ADDED
@@ -0,0 +1,38 @@
+require 'test_helper'
+class RDigTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.log_level = :debug
+      cfg.log_file = 'tmp/test.log'
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_proxy_config
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_nil RDig.open_uri_http_options['Authorization']
+  end
+
+  def test_proxy_auth
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+      cfg.crawler.http_proxy_user = 'username'
+      cfg.crawler.http_proxy_pass = 'password'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_equal "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n", RDig.open_uri_http_options['Authorization']
+  end
+end
+
+
data/test/unit/url_filters_test.rb
CHANGED
@@ -1,5 +1,5 @@
 require 'test_helper'
-class UrlFilterTest < Test::Unit::TestCase
+class UrlFiltersTest < Test::Unit::TestCase
   include TestHelper, RDig
 
   def setup
@@ -73,24 +73,24 @@ class UrlFilterTest < Test::Unit::TestCase
   def test_fix_relative_uri
     doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('http://test.host/dir/another.html')).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('HTTP://test.host/dir/another.html')).uri.to_s)
     doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-                 UrlFilters.fix_relative_uri(
+                 UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
   end
 end
 
metadata
CHANGED
@@ -1,74 +1,88 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
-specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.4
-date: 2006-12-31 00:00:00 +01:00
-summary: Ruby based web site indexing and searching library.
-require_paths:
-- lib
-email: jk@jkraemer.net
-homepage: http://rdig.rubyforge.org/
-rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
-autorequire:
-default_executable: rdig
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-    - - ">"
-      - !ruby/object:Gem::Version
-        version: 0.0.0
-  version:
+  version: 0.3.5
 platform: ruby
-signing_key:
-cert_chain:
 authors:
 - Jens Kraemer
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2008-02-26 00:00:00 +01:00
+default_executable: rdig
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: ferret
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.6"
+    version:
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+email: jk@jkraemer.net
+executables:
+- rdig
+extensions: []
+
+extra_rdoc_files:
+- README
+- CHANGES
+- LICENSE
+- TODO
 files:
 - bin/rdig
-- lib/rdig
-- lib/htmlentities
 - lib/rdig.rb
-- lib/rdig
-- lib/rdig/crawler.rb
-- lib/rdig/search.rb
-- lib/rdig/highlight.rb
-- lib/rdig/index.rb
+- lib/rdig
 - lib/rdig/url_filters.rb
+- lib/rdig/index.rb
+- lib/rdig/crawler.rb
 - lib/rdig/content_extractors.rb
-- lib/rdig/documents.rb
 - lib/rdig/file.rb
-- lib/rdig/
+- lib/rdig/highlight.rb
+- lib/rdig/documents.rb
+- lib/rdig/search.rb
+- lib/rdig/content_extractors
 - lib/rdig/content_extractors/doc.rb
 - lib/rdig/content_extractors/hpricot.rb
 - lib/rdig/content_extractors/pdf.rb
-- lib/htmlentities
+- lib/htmlentities
+- lib/htmlentities/htmlentities.rb
 - lib/htmlentities/COPYING
+- lib/htmlentities/CHANGES
 - lib/htmlentities/README
-- lib/htmlentities/htmlentities.rb
-- test/unit
 - test/fixtures
-- test/test_helper.rb
-- test/unit/etag_filter_test.rb
-- test/unit/url_filters_test.rb
-- test/unit/searcher_test.rb
-- test/unit/rubyful_soup_content_extractor_test.rb
-- test/unit/pdf_content_extractor_test.rb
-- test/unit/hpricot_content_extractor_test.rb
-- test/unit/word_content_extractor_test.rb
-- test/unit/file_document_test.rb
-- test/unit/crawler_fs_test.rb
-- test/fixtures/html
-- test/fixtures/pdf
 - test/fixtures/word
-- test/fixtures/
-- test/fixtures/html
+- test/fixtures/word/simple.doc
+- test/fixtures/html
 - test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/html/simple.html
+- test/fixtures/html/entities.html
+- test/fixtures/pdf
 - test/fixtures/pdf/simple.pdf
-- test/
+- test/unit
+- test/unit/crawler_fs_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/word_content_extractor_test.rb
+- test/unit/rdig_test.rb
+- test/unit/http_document_test.rb
+- test/unit/searcher_test.rb
+- test/unit/file_document_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/hpricot_content_extractor_test.rb
+- test/unit/etag_filter_test.rb
+- test/test_helper.rb
 - doc/examples
 - doc/examples/config.rb
 - LICENSE
@@ -77,41 +91,35 @@ files:
 - README
 - install.rb
 - rakefile
-
-
+has_rdoc: true
+homepage: http://rdig.rubyforge.org/
+post_install_message:
 rdoc_options:
 - --title
 - Rake -- Ruby Make
 - --main
 - README
 - --line-numbers
-
--
-
-
--
-
-
-
-
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
 requirements: []
 
-
-
-
-
-
-
-
-      - !ruby/object:Gem::Version
-        version: 0.10.0
-    version:
-- !ruby/object:Gem::Dependency
-  name: hpricot
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0.4"
-    version:
+rubyforge_project: rdig
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: Ruby based web site indexing and searching library.
+test_files: []
+
data/lib/rdig/content_extractors/rubyful_soup.rb
DELETED
@@ -1,151 +0,0 @@
-begin
-  require 'rubyful_soup'
-rescue LoadError
-  require 'rubygems'
-  require 'rubyful_soup' rescue nil
-end
-
-if defined?(BeautifulSoup)
-
-  # override some methods concered with entity resolving
-  # to convert them to strings
-  class BeautifulStoneSoup
-    # resolve unknown html entities using the htmlentities lib
-    alias :orig_unknown_entityref :unknown_entityref
-    def unknown_entityref(ref)
-      if HTMLEntities::MAP.has_key?(ref)
-        handle_data [HTMLEntities::MAP[ref]].pack('U')
-      else
-        orig_unknown_entityref ref
-      end
-    end
-
-    # resolve numeric entities to utf8
-    def handle_charref(ref)
-      handle_data( ref.gsub(/([0-9]{1,7})/) {
-        [$1.to_i].pack('U')
-      }.gsub(/x([0-9a-f]{1,6})/i) {
-        [$1.to_i(16)].pack('U')
-      } )
-    end
-  end
-
-  module RDig
-    module ContentExtractors
-
-      # extracts title, content and links from html documents
-      class RubyfulSoupContentExtractor < ContentExtractor
-
-        def initialize(config)
-          super(config.rubyful_soup)
-          # if not configured, refuse to handle any content:
-          @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
-        end
-
-        # returns:
-        # { :content => 'extracted clear text',
-        #   :meta => { :title => 'Title' },
-        #   :links => [array of urls] }
-        def process(content)
-          result = { }
-          tag_soup = BeautifulSoup.new(content)
-          result[:title] = extract_title(tag_soup)
-          result[:links] = extract_links(tag_soup)
-          result[:content] = extract_content(tag_soup)
-          return result
-        end
-
-        # Extracts textual content from the HTML tree.
-        #
-        # - First, the root element to use is determined using the
-        #   +content_element+ method, which itself uses the content_tag_selector
-        #   from RDig.configuration.
-        # - Then, this element is processed by +extract_text+, which will give
-        #   all textual content contained in the root element and all it's
-        #   children.
-        def extract_content(tag_soup)
-          content = ''
-          ce = content_element(tag_soup)
-          ce.children { |child|
-            extract_text(child, content)
-          } unless ce.nil?
-          return content.strip
-        end
-
-        # extracts the href attributes of all a tags, except
-        # internal links like <a href="#top">
-        def extract_links(tagsoup)
-          tagsoup.find_all('a').map { |link|
-            CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-          }.compact
-        end
-
-        # Extracts the title from the given html tree
-        def extract_title(tagsoup)
-          the_title_tag = title_tag(tagsoup)
-          if the_title_tag.is_a? String
-            the_title_tag
-          else
-            title = ''
-            extract_text(the_title_tag, title)
-            title.strip
-          end
-        end
-
-        # Recursively extracts all text contained in the given element,
-        # and appends it to content.
-        def extract_text(element, content='')
-          return nil if element.nil?
-          if element.is_a? NavigableString
-            value = strip_comments(element)
-            value.strip!
-            unless value.empty?
-              content << value
-              content << ' '
-            end
-          elsif element.string # it's a Tag, and it has some content string
-            # skip inline scripts and styles
-            return nil if element.name =~ /^(script|style)$/i
-            value = element.string.strip
-            unless value.empty?
-              content << value
-              content << ' '
-            end
-          else
-            element.children { |child|
-              extract_text(child, content)
-            }
-          end
-        end
-
-        # Returns the element to extract the title from.
-        #
-        # This may return a string, e.g. an attribute value selected from a meta
-        # tag, too.
-        def title_tag(tagsoup)
-          if @config.title_tag_selector
-            @config.title_tag_selector.call(tagsoup)
-          else
-            tagsoup.html.head.title
-          end
-        end
-
-        # Retrieve the root element to extract document content from
-        def content_element(tagsoup)
-          if @config.content_tag_selector
-            @config.content_tag_selector.call(tagsoup)
-          else
-            tagsoup.html.body
-          end
-        end
-
-        # Return the given string minus all html comments
-        def strip_comments(string)
-          string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-        end
-      end
-
-    end
-  end
-
-end
data/test/unit/rubyful_soup_content_extractor_test.rb
DELETED
@@ -1,83 +0,0 @@
-require 'test_helper'
-class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
-  include TestHelper
-
-  def setup
-    @config = OpenStruct.new(
-      :content_tag_selector => lambda { |tagsoup|
-        tagsoup.html.body
-      },
-      :title_tag_selector => lambda { |tagsoup|
-        tagsoup.html.head.title
-      })
-    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
-    @nbsp = [160].pack('U') # non breaking space
-  end
-
-  def test_can_do
-    assert !@extractor.can_do('application/pdf')
-    assert !@extractor.can_do('application/msword')
-    assert @extractor.can_do('text/html')
-    assert @extractor.can_do('text/xml')
-    assert @extractor.can_do('application/xml')
-    assert @extractor.can_do('application/xhtml+xml')
-  end
-
-  def test_simple
-    result = ContentExtractors.process(html_doc('simple'), 'text/html')
-    assert_not_nil result
-    assert_equal 'Sample Title', result[:title]
-    assert_not_nil result[:content]
-    assert_not_nil result[:links]
-    assert_equal 1, result[:links].size
-    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
-    assert_equal 'http://test.host/affe.html', result[:links].first
-  end
-
-  def test_entities
-    result = @extractor.process(html_doc('entities'))
-    assert_equal 'Sample & Title', result[:title]
-    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
-    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
-    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
-  end
-
-  def test_custom_content_element
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('h1', :attrs => { 'class', 'title' })
-    end
-    @config.content_tag_selector = lambda do |tagsoup|
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Sample Title in h1', result[:title]
-    assert_equal 'Affe Real content is here.', result[:content]
-    # check if links are collected outside the content tag, too:
-    assert_equal 3, result[:links].size
-    assert_equal 'http://test.host/outside.html', result[:links].first
-    assert_equal '/inside.html', result[:links][1]
-    assert_equal '/footer.html', result[:links][2]
-  end
-
-
-  def test_title_from_dcmeta
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC meta data', result[:title]
-  end
-
-  def test_preprocessed_title
-    @config.title_tag_selector = lambda do |tagsoup|
-      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-      # use only a portion of the title tag's contents if it matches our
-      # regexp:
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC', result[:title]
-  end
-
-end
-