rdig 0.3.9 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/{README → README.rdoc} +1 -1
- data/bin/rdig +0 -0
- data/lib/rdig.rb +6 -7
- data/lib/rdig/content_extractors.rb +4 -4
- data/lib/rdig/content_extractors/pdf.rb +3 -3
- data/lib/rdig/crawler.rb +17 -14
- data/lib/rdig/documents.rb +8 -8
- data/lib/rdig/index.rb +9 -5
- data/lib/rdig/search.rb +19 -9
- data/lib/rdig/url_filters.rb +26 -7
- metadata +4 -4
data/LICENSE
CHANGED
data/{README → README.rdoc}
RENAMED
@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
|
|
25
25
|
=== Handle search in your application:
|
26
26
|
require 'rdig'
|
27
27
|
require 'rdig_config' # load your config file here
|
28
|
-
search_results = RDig.searcher.search(query
|
28
|
+
search_results = RDig.searcher.search(query)
|
29
29
|
|
30
30
|
see RDig::Search::Searcher for more information.
|
31
31
|
|
data/bin/rdig
CHANGED
File without changes
|
data/lib/rdig.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2006 Jens Kraemer
|
4
|
+
# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
|
5
5
|
#
|
6
6
|
# Permission is hereby granted, free of charge, to any person obtaining
|
7
7
|
# a copy of this software and associated documentation files (the
|
@@ -84,7 +84,7 @@ module RDig
|
|
84
84
|
{ RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
|
85
85
|
]
|
86
86
|
}
|
87
|
-
|
87
|
+
|
88
88
|
end
|
89
89
|
|
90
90
|
def application
|
@@ -150,7 +150,7 @@ module RDig
|
|
150
150
|
end
|
151
151
|
end
|
152
152
|
alias config configuration
|
153
|
-
|
153
|
+
|
154
154
|
def logger
|
155
155
|
@logger ||= create_logger
|
156
156
|
end
|
@@ -200,12 +200,12 @@ module RDig
|
|
200
200
|
def options
|
201
201
|
@options ||= OpenStruct.new
|
202
202
|
end
|
203
|
-
|
203
|
+
|
204
204
|
# Display the program usage line.
|
205
205
|
def usage
|
206
206
|
puts "rdig -c configfile {options}"
|
207
207
|
end
|
208
|
-
|
208
|
+
|
209
209
|
# Display the rake command line help.
|
210
210
|
def help
|
211
211
|
usage
|
@@ -266,8 +266,7 @@ module RDig
|
|
266
266
|
rescue
|
267
267
|
puts $!.backtrace
|
268
268
|
fail "No Configfile found!\n#{$!}"
|
269
|
-
|
270
|
-
end
|
269
|
+
end
|
271
270
|
|
272
271
|
puts "using Ferret #{Ferret::VERSION}"
|
273
272
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
# Contains classes which are used for extracting content and meta data from
|
4
4
|
# various content types.
|
5
5
|
module ContentExtractors
|
@@ -13,7 +13,7 @@ module RDig
|
|
13
13
|
# Extractors inheriting from this class will be auto-discovered and used
|
14
14
|
# when can_do returns true
|
15
15
|
class ContentExtractor
|
16
|
-
|
16
|
+
|
17
17
|
def self.inherited(extractor)
|
18
18
|
super(extractor)
|
19
19
|
self.extractors << extractor
|
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
ex
|
33
33
|
}.compact
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
def self.process(content, content_type)
|
37
37
|
self.extractor_instances.each { |extractor|
|
38
38
|
return extractor.process(content) if extractor.can_do(content_type)
|
@@ -61,7 +61,7 @@ module RDig
|
|
61
61
|
end
|
62
62
|
result
|
63
63
|
end
|
64
|
-
|
64
|
+
|
65
65
|
def as_file(content)
|
66
66
|
file = Tempfile.new('rdig')
|
67
67
|
file << content
|
@@ -8,7 +8,7 @@ module RDig
|
|
8
8
|
#
|
9
9
|
class PdfContentExtractor < ContentExtractor
|
10
10
|
include ExternalAppHelper
|
11
|
-
|
11
|
+
|
12
12
|
def initialize(config)
|
13
13
|
super(config)
|
14
14
|
@pattern = /^application\/pdf/
|
@@ -22,7 +22,7 @@ module RDig
|
|
22
22
|
end
|
23
23
|
}
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
def process(content)
|
27
27
|
result = {}
|
28
28
|
as_file(content) do |file|
|
@@ -35,7 +35,7 @@ module RDig
|
|
35
35
|
def get_content(path_to_tempfile)
|
36
36
|
%x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
# extracts the title from pdf meta data
|
40
40
|
# needs pdfinfo
|
41
41
|
# returns the title or nil if no title was found
|
data/lib/rdig/crawler.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
module RDig
|
2
|
-
|
3
|
-
|
4
2
|
class Crawler
|
5
|
-
|
3
|
+
|
6
4
|
def initialize(config = RDig.config, logger = RDig.logger)
|
7
5
|
@documents = Queue.new
|
8
6
|
@logger = logger
|
9
7
|
@config = config
|
8
|
+
@indexed_documents = 0
|
10
9
|
end
|
11
10
|
|
12
11
|
def run
|
@@ -22,6 +21,7 @@ module RDig
|
|
22
21
|
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
|
23
22
|
chain_config = RDig.filter_chain[url_type]
|
24
23
|
|
24
|
+
# the etag filter operates on the fetched document, thats why we cannot put it into the filter chain right now.
|
25
25
|
@etag_filter = ETagFilter.new
|
26
26
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
27
27
|
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
|
@@ -31,9 +31,11 @@ module RDig
|
|
31
31
|
num_threads.times { |i|
|
32
32
|
group.join_nowait Thread.new("fetcher #{i}") {
|
33
33
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
34
|
+
@logger.info "thread #{i} running..."
|
34
35
|
while (doc = @documents.pop) != :exit
|
35
36
|
process_document doc, filterchain
|
36
37
|
end
|
38
|
+
@logger.info "thread #{i} is done."
|
37
39
|
}
|
38
40
|
}
|
39
41
|
|
@@ -47,20 +49,21 @@ module RDig
|
|
47
49
|
|
48
50
|
@logger.info "waiting for threads to finish..."
|
49
51
|
group.all_waits
|
52
|
+
@logger.info "indexed #{@indexer.indexed_documents} documents"
|
50
53
|
end
|
51
54
|
|
52
55
|
def process_document(doc, filterchain)
|
53
|
-
@logger.
|
56
|
+
@logger.info "processing document #{doc.uri}"
|
54
57
|
doc.fetch
|
55
58
|
case doc.status
|
56
59
|
when :success
|
57
|
-
if @etag_filter.apply(doc)
|
60
|
+
if @etag_filter.apply(doc)
|
58
61
|
# add links from this document to the queue
|
59
62
|
doc.content[:links].each { |url|
|
60
63
|
add_url(url, filterchain, doc)
|
61
64
|
} unless doc.content[:links].nil?
|
62
65
|
add_to_index doc
|
63
|
-
end
|
66
|
+
end
|
64
67
|
when :redirect
|
65
68
|
@logger.debug "redirect to #{doc.content}"
|
66
69
|
add_url(doc.content, filterchain, doc)
|
@@ -69,14 +72,16 @@ module RDig
|
|
69
72
|
end
|
70
73
|
rescue
|
71
74
|
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
|
72
|
-
@logger.
|
75
|
+
@logger.info "Trace: #{$!.backtrace.join("\n")}"
|
73
76
|
end
|
74
77
|
|
75
78
|
def add_to_index(doc)
|
76
|
-
|
79
|
+
if doc.needs_indexing?
|
80
|
+
@indexer << doc
|
81
|
+
end
|
77
82
|
end
|
78
83
|
|
79
|
-
|
84
|
+
|
80
85
|
# pipes a new document pointing to url through the filter chain,
|
81
86
|
# if it survives that, it gets added to the documents queue for further
|
82
87
|
# processing
|
@@ -90,19 +95,17 @@ module RDig
|
|
90
95
|
Document.create(url)
|
91
96
|
end
|
92
97
|
|
93
|
-
doc = filterchain.apply(doc)
|
94
|
-
|
95
|
-
if doc
|
98
|
+
if doc = filterchain.apply(doc)
|
96
99
|
@documents << doc
|
97
100
|
@logger.debug "url #{url} survived filterchain"
|
98
101
|
end
|
99
102
|
rescue
|
100
103
|
nil
|
101
104
|
end
|
102
|
-
|
105
|
+
|
103
106
|
end
|
104
107
|
|
105
|
-
|
108
|
+
|
106
109
|
# checks fetched documents' E-Tag headers against the list of E-Tags
|
107
110
|
# of the documents already indexed.
|
108
111
|
# This is supposed to help against double-indexing documents which can
|
data/lib/rdig/documents.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
#
|
4
4
|
# Document base class
|
5
5
|
#
|
6
6
|
class Document
|
7
|
-
|
7
|
+
|
8
8
|
attr_reader :uri
|
9
9
|
attr_reader :content
|
10
10
|
attr_reader :content_type
|
11
|
-
|
11
|
+
|
12
12
|
def self.create(url)
|
13
13
|
return case url
|
14
14
|
when /^https?:\/\//i
|
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
def title; @content[:title] end
|
33
33
|
def body; @content[:content] end
|
34
34
|
def links; @content[:links] end
|
35
|
-
|
35
|
+
|
36
36
|
def needs_indexing?
|
37
37
|
has_content? && (title || body)
|
38
38
|
end
|
@@ -47,7 +47,7 @@ module RDig
|
|
47
47
|
|
48
48
|
end
|
49
49
|
|
50
|
-
|
50
|
+
|
51
51
|
#
|
52
52
|
# Document in a File system
|
53
53
|
#
|
@@ -90,8 +90,8 @@ module RDig
|
|
90
90
|
end
|
91
91
|
|
92
92
|
end
|
93
|
-
|
94
|
-
|
93
|
+
|
94
|
+
|
95
95
|
#
|
96
96
|
# Remote Document to be retrieved by HTTP
|
97
97
|
#
|
@@ -106,7 +106,7 @@ module RDig
|
|
106
106
|
def create_child(uri)
|
107
107
|
HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
# url: url of this document, may be relative to the referring doc or host.
|
111
111
|
# referrer: uri of the document we retrieved this link from
|
112
112
|
def initialize(args={})
|
data/lib/rdig/index.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
module RDig
|
2
2
|
module Index
|
3
|
-
|
3
|
+
|
4
4
|
# used by the crawler to build the ferret index
|
5
5
|
class Indexer
|
6
6
|
include MonitorMixin
|
7
|
-
|
7
|
+
|
8
|
+
attr_reader :indexed_documents
|
9
|
+
|
8
10
|
def initialize(settings)
|
11
|
+
@indexed_documents = 0
|
9
12
|
@config = settings
|
10
13
|
@index_writer = Ferret::Index::IndexWriter.new(
|
11
14
|
:path => settings.path,
|
@@ -13,7 +16,7 @@ module RDig
|
|
13
16
|
:analyzer => settings.analyzer)
|
14
17
|
super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
|
15
18
|
end
|
16
|
-
|
19
|
+
|
17
20
|
def add_to_index(document)
|
18
21
|
RDig.logger.debug "add to index: #{document.uri.to_s}"
|
19
22
|
@config.rewrite_uri.call(document.uri) if @config.rewrite_uri
|
@@ -25,16 +28,17 @@ module RDig
|
|
25
28
|
}
|
26
29
|
synchronize do
|
27
30
|
@index_writer << doc
|
31
|
+
@indexed_documents += 1
|
28
32
|
end
|
29
33
|
end
|
30
34
|
alias :<< :add_to_index
|
31
|
-
|
35
|
+
|
32
36
|
def close
|
33
37
|
@index_writer.optimize
|
34
38
|
@index_writer.close
|
35
39
|
@index_writer = nil
|
36
40
|
end
|
37
41
|
end
|
38
|
-
|
42
|
+
|
39
43
|
end
|
40
44
|
end
|
data/lib/rdig/search.rb
CHANGED
@@ -5,17 +5,17 @@ module RDig
|
|
5
5
|
# Call RDig::searcher to retrieve an instance ready for use.
|
6
6
|
class Searcher
|
7
7
|
include Ferret::Search
|
8
|
-
|
8
|
+
|
9
9
|
# the query parser used to parse query strings
|
10
10
|
attr_reader :query_parser
|
11
|
-
|
11
|
+
|
12
12
|
# takes the ferret section of the rdig configuration as a parameter.
|
13
13
|
def initialize(settings)
|
14
14
|
@ferret_config = settings
|
15
15
|
@query_parser = Ferret::QueryParser.new(settings.marshal_dump)
|
16
16
|
ferret_searcher
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
# returns the Ferret::Search::IndexSearcher instance used internally.
|
20
20
|
def ferret_searcher
|
21
21
|
if @ferret_searcher and !@ferret_searcher.reader.latest?
|
@@ -29,7 +29,14 @@ module RDig
|
|
29
29
|
end
|
30
30
|
@ferret_searcher
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
|
+
def get_maximum_score(query, options)
|
34
|
+
ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
|
35
|
+
return score
|
36
|
+
end
|
37
|
+
0
|
38
|
+
end
|
39
|
+
|
33
40
|
# run a search.
|
34
41
|
# +query+ usually will be a user-entered string. See the Ferret query
|
35
42
|
# language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
|
@@ -46,23 +53,26 @@ module RDig
|
|
46
53
|
RDig.logger.info "Query: #{query}"
|
47
54
|
results = []
|
48
55
|
searcher = ferret_searcher
|
56
|
+
maximum_score = get_maximum_score query, options
|
49
57
|
result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
|
50
58
|
doc = searcher[doc_id]
|
51
59
|
results << { :score => score,
|
52
60
|
:title => doc[:title],
|
53
61
|
:url => doc[:url],
|
54
|
-
:extract => build_extract(doc[:data])
|
62
|
+
:extract => build_extract(doc[:data]),
|
63
|
+
:relative_score => (score / maximum_score)
|
64
|
+
}
|
55
65
|
end
|
56
66
|
result[:list] = results
|
57
67
|
result
|
58
68
|
end
|
59
|
-
|
69
|
+
|
60
70
|
def build_extract(data)
|
61
71
|
(data && data.length > 200) ? data[0..200] : data
|
62
72
|
end
|
63
|
-
|
73
|
+
|
64
74
|
end
|
65
|
-
|
75
|
+
|
66
76
|
# class SearchResult < OpenStruct
|
67
77
|
# def initialize(doc, score)
|
68
78
|
# self.score = score
|
@@ -72,6 +82,6 @@ module RDig
|
|
72
82
|
# end
|
73
83
|
# end
|
74
84
|
|
75
|
-
|
85
|
+
|
76
86
|
end
|
77
87
|
end
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
module UrlFilters
|
4
4
|
|
5
5
|
class FilterChain
|
6
6
|
def initialize(chain_config)
|
7
|
+
@logger = RDig.logger
|
7
8
|
@filters = []
|
8
9
|
chain_config.each { |filter|
|
9
10
|
case filter
|
@@ -29,11 +30,23 @@ module RDig
|
|
29
30
|
when Symbol
|
30
31
|
if args.nil?
|
31
32
|
@filters << lambda { |document|
|
32
|
-
|
33
|
+
begin
|
34
|
+
UrlFilters.send(filter, document)
|
35
|
+
rescue Exception
|
36
|
+
@logger.error "error in URL filter #{filter}: #{$!}"
|
37
|
+
@logger.error $!.backtrace.join("\n")
|
38
|
+
nil
|
39
|
+
end
|
33
40
|
}
|
34
41
|
else
|
35
42
|
@filters << lambda { |document|
|
36
|
-
|
43
|
+
begin
|
44
|
+
UrlFilters.send(filter, document, args)
|
45
|
+
rescue Exception
|
46
|
+
@logger.error "error in URL filter #{filter}: #{$!}"
|
47
|
+
@logger.error $!.backtrace.join("\n")
|
48
|
+
nil
|
49
|
+
end
|
37
50
|
}
|
38
51
|
end
|
39
52
|
when Class
|
@@ -54,7 +67,13 @@ module RDig
|
|
54
67
|
|
55
68
|
def apply(document)
|
56
69
|
@filters.each { |filter|
|
57
|
-
|
70
|
+
@logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
|
71
|
+
unless filter.call(document)
|
72
|
+
@logger.debug "fail"
|
73
|
+
return nil
|
74
|
+
else
|
75
|
+
@logger.debug 'OK'
|
76
|
+
end
|
58
77
|
}
|
59
78
|
return document
|
60
79
|
end
|
@@ -75,7 +94,7 @@ module RDig
|
|
75
94
|
# nil otherwise
|
76
95
|
def apply(document)
|
77
96
|
synchronize do
|
78
|
-
@visited_urls.add?(document.uri.to_s) ? document : nil
|
97
|
+
@visited_urls.add?(document.uri.to_s) ? document : nil
|
79
98
|
end
|
80
99
|
end
|
81
100
|
end
|
@@ -174,7 +193,7 @@ module RDig
|
|
174
193
|
uri.host = ref.host unless uri.host
|
175
194
|
uri.port = ref.port unless uri.port || ref.port==ref.default_port
|
176
195
|
uri.path = ref.path unless uri.path
|
177
|
-
|
196
|
+
|
178
197
|
old_uri_path = uri.path
|
179
198
|
if uri.path !~ /^\// || uri.path =~ /^\.\./
|
180
199
|
ref_path = ref.path || '/'
|
@@ -202,7 +221,7 @@ module RDig
|
|
202
221
|
if document.uri.path =~ /\/$/
|
203
222
|
# append index document if configured
|
204
223
|
if cfg.index_document
|
205
|
-
document.uri.path <<
|
224
|
+
document.uri.path << cfg.index_document
|
206
225
|
elsif cfg.remove_trailing_slash
|
207
226
|
document.uri.path.gsub! /\/$/, ''
|
208
227
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
- 9
|
9
|
-
version: 0.3.9
|
8
|
+
- 10
|
9
|
+
version: 0.3.10
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jens Kraemer
|
@@ -67,7 +67,7 @@ extensions: []
|
|
67
67
|
extra_rdoc_files:
|
68
68
|
- History.txt
|
69
69
|
- Manifest.txt
|
70
|
-
- README
|
70
|
+
- README.rdoc
|
71
71
|
files:
|
72
72
|
- CHANGES
|
73
73
|
- History.txt
|
@@ -75,7 +75,7 @@ files:
|
|
75
75
|
- LICENSE
|
76
76
|
- Manifest.txt
|
77
77
|
- rakefile
|
78
|
-
- README
|
78
|
+
- README.rdoc
|
79
79
|
- bin/rdig
|
80
80
|
- doc/examples/config.rb
|
81
81
|
- lib/rdig/content_extractors/doc.rb
|