rdig 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +8 -0
- data/doc/examples/config.rb +25 -25
- data/lib/rdig.rb +42 -17
- data/lib/rdig/content_extractors.rb +3 -3
- data/lib/rdig/content_extractors/hpricot.rb +7 -7
- data/lib/rdig/crawler.rb +31 -19
- data/lib/rdig/documents.rb +32 -25
- data/lib/rdig/index.rb +1 -1
- data/lib/rdig/search.rb +1 -1
- data/lib/rdig/url_filters.rb +14 -2
- data/rakefile +7 -9
- data/test/unit/http_document_test.rb +17 -0
- data/test/unit/rdig_test.rb +38 -0
- data/test/unit/searcher_test.rb +2 -0
- data/test/unit/url_filters_test.rb +9 -9
- metadata +88 -80
- data/lib/rdig/content_extractors/rubyful_soup.rb +0 -151
- data/test/unit/rubyful_soup_content_extractor_test.rb +0 -83
data/CHANGES
CHANGED

@@ -1,3 +1,11 @@
+0.3.5
+- Add max_depth option to crawler configuration for limiting the crawl to a
+  specific depth
+- add support for http proxies including basic authentication
+- remove rubyfoul_soup support
+
+0.3.4
+
 0.3.2
 - make RDig compatible with Ferret 0.10.x
 - won't work any more with Ferret 0.9.x and before

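Both new crawler features from this release are plain configuration options. A minimal sketch of enabling them together (the proxy host and credentials are placeholders, not values taken from this diff):

    RDig.configuration do |cfg|
      # follow links at most two hops away from any start url
      cfg.crawler.max_depth = 2
      # route all fetches through an authenticating http proxy
      cfg.crawler.http_proxy      = 'http://proxy.example.com:8080'
      cfg.crawler.http_proxy_user = 'scott'
      cfg.crawler.http_proxy_pass = 'tiger'
    end
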
data/doc/examples/config.rb
CHANGED

@@ -2,6 +2,12 @@ RDig.configuration do |cfg|
 
   ##################################################################
   # options you really should set
+
+  # log file location
+  cfg.log_file = '/tmp/rdig.log'
+
+  # log level, set to :debug, :info, :warn or :error
+  cfg.log_level = :info
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]

@@ -29,10 +35,11 @@ RDig.configuration do |cfg|
   # content extraction options
   cfg.content_extraction = OpenStruct.new(
 
-
-
-
-
+    # HPRICOT configuration
+    # hpricot is the html parsing lib used by RDig. See
+    # http://code.whytheluckystiff.net/hpricot for usage information.
+    # Any code blocks given for content selection will receive an Hpricot instance
+    # containing the full page content when called.
     :hpricot => OpenStruct.new(
       # css selector for the element containing the page title
       :title_tag_selector => 'title',

@@ -42,26 +49,6 @@ RDig.configuration do |cfg|
       # might also be a proc returning either an element or a string:
       # :content_tag_selector => lambda { |hpricot_doc| ... }
     )
-
-    # RUBYFUL SOUP
-    # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
-    # RDig's default html parser up to version 0.3.2. To use it, comment the
-    # hpricot config above, and uncomment the following:
-    #
-    # :rubyful_soup => OpenStruct.new(
-    #   # provide a method that returns the title of an html document
-    #   # this method may either return a tag to extract the title from,
-    #   # or a ready-to-index string.
-    #   :content_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.body
-    #   },
-    #   # provide a method that selects the tag containing the page content you
-    #   # want to index. Useful to avoid indexing common elements like navigation
-    #   # and page footers for every page.
-    #   :title_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.head.title
-    #   }
-    # )
   )
 
   # crawler options

@@ -95,12 +82,25 @@ RDig.configuration do |cfg|
   # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
 
+  # limit the crawling depth. Default: nil (unlimited)
+  # Set to 0 to only index the start_urls.
+  # cfg.crawler.max_depth = nil
+
+  # http proxy configuration
+  # proxy url
+  # cfg.crawler.http_proxy = nil
+  #
+  # proxy username
+  # cfg.crawler.http_proxy_user = nil
+  # proxy password
+  # cfg.crawler.http_proxy_pass = nil
+
   # indexer options
 
   # create a new index on each run. Will append to the index if false. Use when
   # building a single index from multiple runs, e.g. one across a website and the
   # other a tree in a local file system
-  #
+  # cfg.index.create = true
 
   # rewrite document uris before indexing them. This is useful if you're
   # indexing on disk, but the documents should be accessible via http, e.g. from

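As the comment above notes, :content_tag_selector may also be a proc that receives the parsed Hpricot document. A sketch of selecting only a main content column (the div id is hypothetical; Hpricot's at method takes a CSS selector):

    cfg.content_extraction.hpricot.content_tag_selector = lambda do |hpricot_doc|
      # index only the main column; fall back to the whole body
      hpricot_doc.at('div#content') || hpricot_doc.at('body')
    end
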
data/lib/rdig.rb
CHANGED

@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.3.4'
+RDIGVERSION = '0.3.5'
 
 
 require 'thread'

@@ -39,6 +39,8 @@ require 'net/http'
 require 'getoptlong'
 require 'tempfile'
 require 'open-uri'
+require 'logger'
+require 'base64'
 
 begin
   require 'ferret'

@@ -69,10 +71,11 @@ module RDig
       :scheme_filter_http,
       :fix_relative_uri,
       :normalize_uri,
+      { RDig::UrlFilters::DepthFilter => :max_depth },
      { :hostname_filter => :include_hosts },
       { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
       { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
-      RDig::UrlFilters::VisitedUrlFilter
+      RDig::UrlFilters::VisitedUrlFilter
     ],
     # filter chain for file system crawling
     :file => [

@@ -103,6 +106,8 @@ module RDig
       yield configuration
     else
       @config ||= OpenStruct.new(
+        :log_file => '/tmp/rdig.log',
+        :log_level => :warn,
         :crawler => OpenStruct.new(
           :start_urls => [ "http://localhost:3000/" ],
           :include_hosts => [ "localhost" ],

@@ -111,7 +116,11 @@ module RDig
           :index_document => nil,
           :num_threads => 2,
           :max_redirects => 5,
-          :
+          :max_depth => nil,
+          :wait_before_leave => 10,
+          :http_proxy => nil,
+          :http_proxy_user => nil,
+          :http_proxy_pass => nil
         ),
         :content_extraction => OpenStruct.new(
           # settings for html content extraction (hpricot)

@@ -124,19 +133,6 @@ module RDig
           # might also be a proc returning either an element or a string:
           # :content_tag_selector => lambda { |hpricot_doc| ... }
         )
-        #,
-        # # settings for html content extraction (RubyfulSoup)
-        # :rubyful_soup => OpenStruct.new(
-        #   # select the html element that contains the content to index
-        #   # by default, we index all inside the body tag:
-        #   :content_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.body
-        #   },
-        #   # select the html element containing the title
-        #   :title_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.head.title
-        #   }
-        # )
       ),
       :index => OpenStruct.new(
         :path => "index/",

@@ -151,6 +147,36 @@ module RDig
     end
     alias config configuration
 
+    def logger
+      @logger ||= create_logger
+    end
+
+    def logger=(log)
+      @logger = log
+    end
+
+    def create_logger
+      l = Logger.new(RDig.config.log_file)
+      l.level = Logger.const_get RDig.config.log_level.to_s.upcase rescue Logger::WARN
+      return l
+    end
+
+    # returns http options for open_uri if configured
+    def open_uri_http_options
+      unless RDig::configuration.crawler.open_uri_http_options
+        opts = {}
+        if RDig::configuration.crawler.http_proxy
+          opts[:proxy] = RDig::configuration.crawler.http_proxy
+          if user = RDig::configuration.crawler.http_proxy_user
+            pass = RDig::configuration.crawler.http_proxy_pass
+            opts['Authorization'] = "Basic " + Base64.encode64("#{user}:#{pass}")
+          end
+        end
+        RDig::configuration.crawler.open_uri_http_options = opts
+      end
+      return RDig::configuration.crawler.open_uri_http_options
+    end
+
   end
 
   class Application

@@ -210,7 +236,6 @@ module RDig
         when '--query'
           options.query = value
         when '--version'
-          puts "rdig, version #{RDIGVERSION}"
           exit
         else
           fail "Unknown option: #{opt}"

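open_uri_http_options builds the options hash once, caches it in the crawler configuration, and HttpDocument#fetch passes it straight to open-uri. A sketch of the equivalent direct call (URL and proxy are placeholders):

    require 'open-uri'
    require 'base64'

    opts = {
      :proxy => 'http://proxy.example.com:8080',
      'Authorization' => 'Basic ' + Base64.encode64('user:secret'),
    }
    # open-uri accepts the :proxy option plus arbitrary header fields
    open('http://www.example.com/', opts) { |page| p page.status }

Note that the hash is memoized in the configuration, so proxy settings changed after the first fetch are not picked up within the same run.
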
data/lib/rdig/content_extractors.rb
CHANGED

@@ -22,7 +22,7 @@ module RDig
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
       @@extractor_instances ||= extractors.map { |ex_class|
-
+        RDig.logger.info "initializing content extractor: #{ex_class}"
         ex_class.new(RDig.configuration.content_extraction) rescue nil
       }.compact
     end

@@ -77,8 +77,8 @@ end
 # load content extractors
 Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
   begin
-    require f
+    require f
   rescue LoadError
-
+    RDig::logger.error "could not load #{f}: #{$!}"
   end
 end

data/lib/rdig/content_extractors/hpricot.rb
CHANGED

@@ -40,12 +40,11 @@ module RDig
     # all textual content contained in the root element and all it's
     # children.
     def extract_content(doc)
-
-
-
-      #
-
-      return content.strip
+      if ce = content_element(doc)
+        return strip_tags(strip_comments(ce.inner_html))
+      end
+      # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
+      return ''
     end
 
     # extracts the href attributes of all a tags, except

@@ -91,7 +90,8 @@ module RDig
                   Regexp::MULTILINE, 'u'), ''
       string.gsub! Regexp.new('<.+?>',
                   Regexp::MULTILINE, 'u'), ''
-      string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.gsub! Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.strip
     end
 
   end

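The rewritten extract_content feeds the selected element's inner_html through strip_comments and strip_tags, so the result is whitespace-normalized plain text. A rough sketch of what that regex pipeline does to a fragment (mirroring the gsub calls shown above; the comment-stripping regex appears in the rubyful_soup file removed later in this diff):

    html = "<div><!-- nav --><p>Some  sample\ntext</p></div>"
    text = html.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '') # comments first
    text.gsub!(Regexp.new('<.+?>', Regexp::MULTILINE, 'u'), '')            # then tags
    text.gsub!(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ')             # collapse whitespace
    text.strip   # => "Some sample text"
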
data/lib/rdig/crawler.rb
CHANGED

@@ -3,23 +3,30 @@ module RDig
 
   class Crawler
 
-    def initialize
+    def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
       @etag_filter = ETagFilter.new
+      @logger = logger
+      @config = config
     end
 
     def run
-
-
-
+      @indexer = Index::Indexer.new(@config.index)
+      crawl
+    ensure
+      @indexer.close if @indexer
+    end
+
+    def crawl
+      raise 'no start urls given!' if @config.crawler.start_urls.empty?
       # check whether we are indexing on-disk or via http
-      url_type =
+      url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
 
       filterchain = UrlFilters::FilterChain.new(chain_config)
-
-
-      num_threads =
+      @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
+
+      num_threads = @config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {

@@ -31,20 +38,19 @@ module RDig
       }
 
       # check for an empty queue every now and then
-      sleep_interval =
+      sleep_interval = @config.crawler.wait_before_leave
       begin
         sleep sleep_interval
       end until @documents.empty?
       # nothing to do any more, tell the threads to exit
       num_threads.times { @documents << :exit }
 
-
+      @logger.info "waiting for threads to finish..."
       group.all_waits
-    ensure
-      @indexer.close if @indexer
     end
 
     def process_document(doc, filterchain)
+      @logger.debug "processing document #{doc}"
       doc.fetch
       # add links from this document to the queue
       doc.content[:links].each { |url|

@@ -52,10 +58,14 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
+      add_to_index doc
     rescue
-
-
+      @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
+      @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+    end
+
+    def add_to_index(doc)
+      @indexer << doc if doc.needs_indexing?
     end
 
 
@@ -64,17 +74,19 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-
-
+
+      @logger.debug "add_url #{url}"
+      doc = if referring_document
+        referring_document.create_child(url)
       else
-
+        Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
         @documents << doc
-
+        @logger.debug "url #{url} survived filterchain"
       end
     rescue
       nil

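Since the constructor now takes the config and logger as defaulted parameters, a crawler can be driven without touching the global state. A minimal sketch (my_config stands for any object shaped like RDig.config):

    crawler = RDig::Crawler.new    # uses RDig.config and RDig.logger
    crawler.run                    # builds the indexer, crawls, closes the index

    # explicit injection, e.g. in tests:
    crawler = RDig::Crawler.new(my_config, Logger.new(STDOUT))
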
data/lib/rdig/documents.rb
CHANGED

@@ -9,30 +9,23 @@ module RDig
     attr_reader :content
     attr_reader :content_type
 
-    def self.create(url
-
-
-
-
-
-      case url
-      when /^https?:\/\//i
-        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
-      when /^file:\/\//i
-        # files don't have referrers - the check for nil prevents us from being
-        # tricked into indexing local files by file:// links in the web site
-        # we index.
-        FileDocument.new(:url => url) if referrer_uri.nil?
+    def self.create(url)
+      return case url
+      when /^https?:\/\//i
+        HttpDocument.new(:uri => url)
+      when /^file:\/\//i
+        FileDocument.new(:uri => url)
       end
     end
 
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args)
+      RDig.logger.debug "initialize: #{args.inspect}"
       begin
-        @uri = URI.parse(args[:
+        @uri = URI.parse(args[:uri])
       rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{args[:
+        raise "Cannot create document using invalid URL: #{args[:uri]}"
       end
     end

@@ -48,6 +41,10 @@ module RDig
       !self.content.nil?
     end
 
+    def to_s
+      "#{self.class.name}, uri=#{uri}, title=#{has_content? ? title : 'not loaded yet'}"
+    end
+
   end
 

@@ -59,14 +56,17 @@ module RDig
       super(args)
     end
 
+    def create_child(uri)
+      FileDocument.new(:uri => uri)
+    end
+
     def self.find_files(path)
       links = []
+      pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
       Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        RDig.logger.debug "checking file #{filename}"
         # Skip files not matching known mime types
-
-        if File.directory?(filename) || filename =~ pattern
-          links << "file://#{filename}"
-        end
+        links << "file://#{filename}" if File.directory?(filename) || filename =~ pattern
       end
       links
     end

@@ -97,20 +97,27 @@ module RDig
   #
   class HttpDocument < Document
 
+    # counts how far this document is away from one of the start urls. Used to limit crawling by depth.
+    attr_reader :depth
     attr_reader :referring_uri
     attr_reader :status
     attr_reader :etag
+
+    def create_child(uri)
+      HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
+    end
 
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args={})
       super(args)
       @referring_uri = args[:referrer]
+      @depth = args[:depth] || 0
     end
 
     def fetch
-
-      open(@uri.to_s) do |doc|
+      RDig.logger.debug "fetching #{@uri.to_s}"
+      open(@uri.to_s, RDig::open_uri_http_options) do |doc|
         case doc.status.first.to_i
         when 200
           @etag = doc.meta['etag']

@@ -118,13 +125,13 @@ module RDig
           @content = ContentExtractors.process(doc.read, doc.content_type)
           @status = :success
         when 404
-
+          RDig.logger.info "got 404 for #{@uri}"
         else
-
+          RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
         end
       end
     rescue
-
+      RDig.logger.warn "error fetching #{@uri.to_s}: #{$!}"
     ensure
       @content ||= {}
     end

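Depth bookkeeping lives entirely in create_child: every HttpDocument stamps its children with depth + 1, and Document.create starts at 0, so the DepthFilter added in url_filters.rb (below) only ever compares one integer. A sketch with absolute URLs:

    root  = RDig::Document.create('http://www.example.com/')     # depth 0
    child = root.create_child('http://www.example.com/a.html')   # depth 1
    child.create_child('http://www.example.com/b.html').depth    # => 2
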
data/lib/rdig/index.rb
CHANGED

@@ -15,7 +15,7 @@ module RDig
     end
 
     def add_to_index(document)
-
+      RDig.logger.debug "add to index: #{document.uri.to_s}"
       @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
       # all stored and tokenized, should be ferret defaults
       doc = {

data/lib/rdig/search.rb
CHANGED

@@ -43,7 +43,7 @@ module RDig
     def search(query, options={})
       result = {}
       query = query_parser.parse(query) if query.is_a?(String)
-
+      RDig.logger.info "Query: #{query}"
       results = []
       searcher = ferret_searcher
       result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|

data/lib/rdig/url_filters.rb
CHANGED

@@ -80,6 +80,15 @@ module RDig
       end
     end
 
+    class DepthFilter
+      def initialize(max_depth = nil)
+        @max_depth = max_depth
+      end
+      def apply(document)
+        return document if @max_depth.nil? || document.depth <= @max_depth
+      end
+    end
+
 
     # base class for url inclusion / exclusion filters
     class PatternFilter

@@ -98,6 +107,7 @@ module RDig
       end
     end
   end
+
   class UrlExclusionFilter < PatternFilter
     # returns nil if any of the patterns matches it's URI,
     # the document itself otherwise

@@ -176,9 +186,11 @@ module RDig
       p document.uri
     end
 
+    # filter uris by hostname list. With a nil or empty list all documents may
+    # pass this filter.
     def UrlFilters.hostname_filter(document, include_hosts)
-
-      return nil
+      #RDig.logger.debug "hostname_filter: #{include_hosts}"
+      return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
     end
 
     def UrlFilters.normalize_uri(document)

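The filter chain entry { RDig::UrlFilters::DepthFilter => :max_depth } in rdig.rb hands the configured limit to the constructor; with a nil limit every document passes. A sketch of the filter in isolation:

    filter = RDig::UrlFilters::DepthFilter.new(1)
    doc  = RDig::Document.create('http://www.example.com/')   # depth 0
    deep = doc.create_child('/a').create_child('/b')           # depth 2
    filter.apply(doc)    # => doc (depth 0 <= 1)
    filter.apply(deep)   # => nil (dropped, so its links are never followed)
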
data/rakefile
CHANGED

@@ -21,7 +21,7 @@ end
 PKG_NAME = 'rdig'
 
 # Determine the current version of the software
-if `ruby -Ilib ./bin/rdig --version` =~ /
+if `ruby -Ilib ./bin/rdig --version` =~ /RDig version ([0-9.]+)$/
   CURRENT_VERSION = $1
 else
   CURRENT_VERSION = "0.0.0"

@@ -131,10 +131,7 @@ else
   #### Dependencies and requirements.
 
   s.add_dependency('ferret', '>= 0.10.0')
-
-  # ORed dependencies...
-  #s.add_dependency('rubyful_soup', '>= 1.0.4')
-  s.add_dependency('hpricot', '>= 0.4')
+  s.add_dependency('hpricot', '>= 0.6')
   #s.requirements << ""
 
   #### Which files are to be included in this gem? Everything! (Except CVS directories.)

@@ -282,9 +279,9 @@ task :prerelease do
     announce "Release Task Testing, skipping checked-in file test"
   else
     announce "Checking for unchecked-in files..."
-    data = `
-    unless data =~
-      fail "
+    data = `git status`
+    unless data =~ /working directory clean/
+      fail "GIT status is not clean ... do you have unchecked-in files?"
     end
     announce "No outstanding checkins found ... OK"
   end

@@ -310,7 +307,8 @@ task :update_version => [:prerelease] do
   if ENV['RELTEST']
     announce "Release Task Testing, skipping commiting of new version"
   else
-    sh %{
+    sh %{git commit -a -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+    sh %{git svn dcommit}
   end
 end

data/test/unit/http_document_test.rb
ADDED

@@ -0,0 +1,17 @@
+require 'test_helper'
+class HttpDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_initialize
+    d = Document.create 'http://1stlineleewes.com'
+    assert_equal '1stlineleewes.com', d.uri.host
+    assert_equal '', d.uri.path
+  end
+
+end
+
+

data/test/unit/rdig_test.rb
ADDED

@@ -0,0 +1,38 @@
+require 'test_helper'
+class RDigTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.log_level = :debug
+      cfg.log_file = 'tmp/test.log'
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_proxy_config
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_nil RDig.open_uri_http_options['Authorization']
+  end
+
+  def test_proxy_auth
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+      cfg.crawler.http_proxy_user = 'username'
+      cfg.crawler.http_proxy_pass = 'password'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_equal "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n", RDig.open_uri_http_options['Authorization']
+  end
+end
+
+

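The trailing \n in the expected Authorization value above is real: Base64.encode64 line-wraps its output and always appends a newline, and open_uri_http_options uses the encoded value as-is. The expected string is simply username:password encoded:

    require 'base64'
    Base64.encode64('username:password')   # => "dXNlcm5hbWU6cGFzc3dvcmQ=\n"
    # encode64 breaks its output into 60-character lines; for short input
    # that just means one trailing newline. (Ruby 1.9's strict_encode64
    # avoids it, but these tests pin the encode64 behaviour.)
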
data/test/unit/searcher_test.rb
CHANGED

data/test/unit/url_filters_test.rb
CHANGED

@@ -1,5 +1,5 @@
 require 'test_helper'
-class
+class UrlFiltersTest < Test::Unit::TestCase
   include TestHelper, RDig
 
   def setup

@@ -73,24 +73,24 @@ class UrlFilterTest < Test::Unit::TestCase
   def test_fix_relative_uri
     doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('http://test.host/dir/another.html')).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('HTTP://test.host/dir/another.html')).uri.to_s)
     doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
   end
 end
 

metadata
CHANGED

@@ -1,74 +1,88 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
-specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.
-date: 2006-12-31 00:00:00 +01:00
-summary: Ruby based web site indexing and searching library.
-require_paths:
-- lib
-email: jk@jkraemer.net
-homepage: http://rdig.rubyforge.org/
-rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
-autorequire:
-default_executable: rdig
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  version: 0.3.5
 platform: ruby
-signing_key:
-cert_chain:
 authors:
 - Jens Kraemer
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2008-02-26 00:00:00 +01:00
+default_executable: rdig
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: ferret
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.6"
+    version:
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+email: jk@jkraemer.net
+executables:
+- rdig
+extensions: []
+
+extra_rdoc_files:
+- README
+- CHANGES
+- LICENSE
+- TODO
 files:
 - bin/rdig
-- lib/rdig
-- lib/htmlentities
 - lib/rdig.rb
-- lib/rdig
-- lib/rdig/crawler.rb
-- lib/rdig/search.rb
-- lib/rdig/highlight.rb
-- lib/rdig/index.rb
+- lib/rdig
 - lib/rdig/url_filters.rb
+- lib/rdig/index.rb
+- lib/rdig/crawler.rb
 - lib/rdig/content_extractors.rb
-- lib/rdig/documents.rb
 - lib/rdig/file.rb
-- lib/rdig/
+- lib/rdig/highlight.rb
+- lib/rdig/documents.rb
+- lib/rdig/search.rb
+- lib/rdig/content_extractors
 - lib/rdig/content_extractors/doc.rb
 - lib/rdig/content_extractors/hpricot.rb
 - lib/rdig/content_extractors/pdf.rb
-- lib/htmlentities
+- lib/htmlentities
+- lib/htmlentities/htmlentities.rb
 - lib/htmlentities/COPYING
+- lib/htmlentities/CHANGES
 - lib/htmlentities/README
-- lib/htmlentities/htmlentities.rb
-- test/unit
 - test/fixtures
-- test/test_helper.rb
-- test/unit/etag_filter_test.rb
-- test/unit/url_filters_test.rb
-- test/unit/searcher_test.rb
-- test/unit/rubyful_soup_content_extractor_test.rb
-- test/unit/pdf_content_extractor_test.rb
-- test/unit/hpricot_content_extractor_test.rb
-- test/unit/word_content_extractor_test.rb
-- test/unit/file_document_test.rb
-- test/unit/crawler_fs_test.rb
-- test/fixtures/html
-- test/fixtures/pdf
 - test/fixtures/word
-- test/fixtures/
-- test/fixtures/html
+- test/fixtures/word/simple.doc
+- test/fixtures/html
 - test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/html/simple.html
+- test/fixtures/html/entities.html
+- test/fixtures/pdf
 - test/fixtures/pdf/simple.pdf
-- test/
+- test/unit
+- test/unit/crawler_fs_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/word_content_extractor_test.rb
+- test/unit/rdig_test.rb
+- test/unit/http_document_test.rb
+- test/unit/searcher_test.rb
+- test/unit/file_document_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/hpricot_content_extractor_test.rb
+- test/unit/etag_filter_test.rb
+- test/test_helper.rb
 - doc/examples
 - doc/examples/config.rb
 - LICENSE

@@ -77,41 +91,35 @@ files:
 - README
 - install.rb
 - rakefile
-
-
+has_rdoc: true
+homepage: http://rdig.rubyforge.org/
+post_install_message:
 rdoc_options:
 - --title
 - Rake -- Ruby Make
 - --main
 - README
 - --line-numbers
-
--
-
-
--
-
-
-
-
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
 requirements: []
 
-
-
-
-
-
-
-
-    - !ruby/object:Gem::Version
-      version: 0.10.0
-  version:
-- !ruby/object:Gem::Dependency
-  name: hpricot
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0.4"
-  version:
+rubyforge_project: rdig
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: Ruby based web site indexing and searching library.
+test_files: []
+

data/lib/rdig/content_extractors/rubyful_soup.rb
REMOVED

@@ -1,151 +0,0 @@
-begin
-  require 'rubyful_soup'
-rescue LoadError
-  require 'rubygems'
-  require 'rubyful_soup' rescue nil
-end
-
-if defined?(BeautifulSoup)
-
-# override some methods concered with entity resolving
-# to convert them to strings
-class BeautifulStoneSoup
-  # resolve unknown html entities using the htmlentities lib
-  alias :orig_unknown_entityref :unknown_entityref
-  def unknown_entityref(ref)
-    if HTMLEntities::MAP.has_key?(ref)
-      handle_data [HTMLEntities::MAP[ref]].pack('U')
-    else
-      orig_unknown_entityref ref
-    end
-  end
-
-  # resolve numeric entities to utf8
-  def handle_charref(ref)
-    handle_data( ref.gsub(/([0-9]{1,7})/) {
-      [$1.to_i].pack('U')
-    }.gsub(/x([0-9a-f]{1,6})/i) {
-      [$1.to_i(16)].pack('U')
-    } )
-  end
-end
-
-module RDig
-  module ContentExtractors
-
-    # extracts title, content and links from html documents
-    class RubyfulSoupContentExtractor < ContentExtractor
-
-      def initialize(config)
-        super(config.rubyful_soup)
-        # if not configured, refuse to handle any content:
-        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
-      end
-
-      # returns:
-      # { :content => 'extracted clear text',
-      #   :meta => { :title => 'Title' },
-      #   :links => [array of urls] }
-      def process(content)
-        result = { }
-        tag_soup = BeautifulSoup.new(content)
-        result[:title] = extract_title(tag_soup)
-        result[:links] = extract_links(tag_soup)
-        result[:content] = extract_content(tag_soup)
-        return result
-      end
-
-      # Extracts textual content from the HTML tree.
-      #
-      # - First, the root element to use is determined using the
-      #   +content_element+ method, which itself uses the content_tag_selector
-      #   from RDig.configuration.
-      # - Then, this element is processed by +extract_text+, which will give
-      #   all textual content contained in the root element and all it's
-      #   children.
-      def extract_content(tag_soup)
-        content = ''
-        ce = content_element(tag_soup)
-        ce.children { |child|
-          extract_text(child, content)
-        } unless ce.nil?
-        return content.strip
-      end
-
-      # extracts the href attributes of all a tags, except
-      # internal links like <a href="#top">
-      def extract_links(tagsoup)
-        tagsoup.find_all('a').map { |link|
-          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-        }.compact
-      end
-
-      # Extracts the title from the given html tree
-      def extract_title(tagsoup)
-        the_title_tag = title_tag(tagsoup)
-        if the_title_tag.is_a? String
-          the_title_tag
-        else
-          title = ''
-          extract_text(the_title_tag, title)
-          title.strip
-        end
-      end
-
-      # Recursively extracts all text contained in the given element,
-      # and appends it to content.
-      def extract_text(element, content='')
-        return nil if element.nil?
-        if element.is_a? NavigableString
-          value = strip_comments(element)
-          value.strip!
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        elsif element.string # it's a Tag, and it has some content string
-          # skip inline scripts and styles
-          return nil if element.name =~ /^(script|style)$/i
-          value = element.string.strip
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        else
-          element.children { |child|
-            extract_text(child, content)
-          }
-        end
-      end
-
-      # Returns the element to extract the title from.
-      #
-      # This may return a string, e.g. an attribute value selected from a meta
-      # tag, too.
-      def title_tag(tagsoup)
-        if @config.title_tag_selector
-          @config.title_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.head.title
-        end
-      end
-
-      # Retrieve the root element to extract document content from
-      def content_element(tagsoup)
-        if @config.content_tag_selector
-          @config.content_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.body
-        end
-      end
-
-      # Return the given string minus all html comments
-      def strip_comments(string)
-        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-      end
-    end
-
-  end
-end
-
-end

data/test/unit/rubyful_soup_content_extractor_test.rb
REMOVED

@@ -1,83 +0,0 @@
-require 'test_helper'
-class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
-  include TestHelper
-
-  def setup
-    @config = OpenStruct.new(
-      :content_tag_selector => lambda { |tagsoup|
-        tagsoup.html.body
-      },
-      :title_tag_selector => lambda { |tagsoup|
-        tagsoup.html.head.title
-      })
-    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
-    @nbsp = [160].pack('U') # non breaking space
-  end
-
-  def test_can_do
-    assert !@extractor.can_do('application/pdf')
-    assert !@extractor.can_do('application/msword')
-    assert @extractor.can_do('text/html')
-    assert @extractor.can_do('text/xml')
-    assert @extractor.can_do('application/xml')
-    assert @extractor.can_do('application/xhtml+xml')
-  end
-
-  def test_simple
-    result = ContentExtractors.process(html_doc('simple'), 'text/html')
-    assert_not_nil result
-    assert_equal 'Sample Title', result[:title]
-    assert_not_nil result[:content]
-    assert_not_nil result[:links]
-    assert_equal 1, result[:links].size
-    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
-    assert_equal 'http://test.host/affe.html', result[:links].first
-  end
-
-  def test_entities
-    result = @extractor.process(html_doc('entities'))
-    assert_equal 'Sample & Title', result[:title]
-    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
-    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
-    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
-  end
-
-  def test_custom_content_element
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('h1', :attrs => { 'class', 'title' })
-    end
-    @config.content_tag_selector = lambda do |tagsoup|
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Sample Title in h1', result[:title]
-    assert_equal 'Affe Real content is here.', result[:content]
-    # check if links are collected outside the content tag, too:
-    assert_equal 3, result[:links].size
-    assert_equal 'http://test.host/outside.html', result[:links].first
-    assert_equal '/inside.html', result[:links][1]
-    assert_equal '/footer.html', result[:links][2]
-  end
-
-
-  def test_title_from_dcmeta
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC meta data', result[:title]
-  end
-
-  def test_preprocessed_title
-    @config.title_tag_selector = lambda do |tagsoup|
-      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-      # use only a portion of the title tag's contents if it matches our
-      # regexp:
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC', result[:title]
-  end
-
-end
-