rdig 0.3.9 → 0.3.10
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/{README → README.rdoc} +1 -1
- data/bin/rdig +0 -0
- data/lib/rdig.rb +6 -7
- data/lib/rdig/content_extractors.rb +4 -4
- data/lib/rdig/content_extractors/pdf.rb +3 -3
- data/lib/rdig/crawler.rb +17 -14
- data/lib/rdig/documents.rb +8 -8
- data/lib/rdig/index.rb +9 -5
- data/lib/rdig/search.rb +19 -9
- data/lib/rdig/url_filters.rb +26 -7
- metadata +4 -4
data/LICENSE
CHANGED
data/{README → README.rdoc}
RENAMED
@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
|
|
25
25
|
=== Handle search in your application:
|
26
26
|
require 'rdig'
|
27
27
|
require 'rdig_config' # load your config file here
|
28
|
-
search_results = RDig.searcher.search(query
|
28
|
+
search_results = RDig.searcher.search(query)
|
29
29
|
|
30
30
|
see RDig::Search::Searcher for more information.
|
31
31
|
|
data/bin/rdig
CHANGED
File without changes
|
data/lib/rdig.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
#--
|
4
|
-
# Copyright (c) 2006 Jens Kraemer
|
4
|
+
# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
|
5
5
|
#
|
6
6
|
# Permission is hereby granted, free of charge, to any person obtaining
|
7
7
|
# a copy of this software and associated documentation files (the
|
@@ -84,7 +84,7 @@ module RDig
|
|
84
84
|
{ RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
|
85
85
|
]
|
86
86
|
}
|
87
|
-
|
87
|
+
|
88
88
|
end
|
89
89
|
|
90
90
|
def application
|
@@ -150,7 +150,7 @@ module RDig
|
|
150
150
|
end
|
151
151
|
end
|
152
152
|
alias config configuration
|
153
|
-
|
153
|
+
|
154
154
|
def logger
|
155
155
|
@logger ||= create_logger
|
156
156
|
end
|
@@ -200,12 +200,12 @@ module RDig
|
|
200
200
|
def options
|
201
201
|
@options ||= OpenStruct.new
|
202
202
|
end
|
203
|
-
|
203
|
+
|
204
204
|
# Display the program usage line.
|
205
205
|
def usage
|
206
206
|
puts "rdig -c configfile {options}"
|
207
207
|
end
|
208
|
-
|
208
|
+
|
209
209
|
# Display the rake command line help.
|
210
210
|
def help
|
211
211
|
usage
|
@@ -266,8 +266,7 @@ module RDig
|
|
266
266
|
rescue
|
267
267
|
puts $!.backtrace
|
268
268
|
fail "No Configfile found!\n#{$!}"
|
269
|
-
|
270
|
-
end
|
269
|
+
end
|
271
270
|
|
272
271
|
puts "using Ferret #{Ferret::VERSION}"
|
273
272
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
# Contains classes which are used for extracting content and meta data from
|
4
4
|
# various content types.
|
5
5
|
module ContentExtractors
|
@@ -13,7 +13,7 @@ module RDig
|
|
13
13
|
# Extractors inheriting from this class will be auto-discovered and used
|
14
14
|
# when can_do returns true
|
15
15
|
class ContentExtractor
|
16
|
-
|
16
|
+
|
17
17
|
def self.inherited(extractor)
|
18
18
|
super(extractor)
|
19
19
|
self.extractors << extractor
|
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
ex
|
33
33
|
}.compact
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
def self.process(content, content_type)
|
37
37
|
self.extractor_instances.each { |extractor|
|
38
38
|
return extractor.process(content) if extractor.can_do(content_type)
|
@@ -61,7 +61,7 @@ module RDig
|
|
61
61
|
end
|
62
62
|
result
|
63
63
|
end
|
64
|
-
|
64
|
+
|
65
65
|
def as_file(content)
|
66
66
|
file = Tempfile.new('rdig')
|
67
67
|
file << content
|
@@ -8,7 +8,7 @@ module RDig
|
|
8
8
|
#
|
9
9
|
class PdfContentExtractor < ContentExtractor
|
10
10
|
include ExternalAppHelper
|
11
|
-
|
11
|
+
|
12
12
|
def initialize(config)
|
13
13
|
super(config)
|
14
14
|
@pattern = /^application\/pdf/
|
@@ -22,7 +22,7 @@ module RDig
|
|
22
22
|
end
|
23
23
|
}
|
24
24
|
end
|
25
|
-
|
25
|
+
|
26
26
|
def process(content)
|
27
27
|
result = {}
|
28
28
|
as_file(content) do |file|
|
@@ -35,7 +35,7 @@ module RDig
|
|
35
35
|
def get_content(path_to_tempfile)
|
36
36
|
%x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
# extracts the title from pdf meta data
|
40
40
|
# needs pdfinfo
|
41
41
|
# returns the title or nil if no title was found
|
data/lib/rdig/crawler.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
module RDig
|
2
|
-
|
3
|
-
|
4
2
|
class Crawler
|
5
|
-
|
3
|
+
|
6
4
|
def initialize(config = RDig.config, logger = RDig.logger)
|
7
5
|
@documents = Queue.new
|
8
6
|
@logger = logger
|
9
7
|
@config = config
|
8
|
+
@indexed_documents = 0
|
10
9
|
end
|
11
10
|
|
12
11
|
def run
|
@@ -22,6 +21,7 @@ module RDig
|
|
22
21
|
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
|
23
22
|
chain_config = RDig.filter_chain[url_type]
|
24
23
|
|
24
|
+
# the etag filter operates on the fetched document, thats why we cannot put it into the filter chain right now.
|
25
25
|
@etag_filter = ETagFilter.new
|
26
26
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
27
27
|
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
|
@@ -31,9 +31,11 @@ module RDig
|
|
31
31
|
num_threads.times { |i|
|
32
32
|
group.join_nowait Thread.new("fetcher #{i}") {
|
33
33
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
34
|
+
@logger.info "thread #{i} running..."
|
34
35
|
while (doc = @documents.pop) != :exit
|
35
36
|
process_document doc, filterchain
|
36
37
|
end
|
38
|
+
@logger.info "thread #{i} is done."
|
37
39
|
}
|
38
40
|
}
|
39
41
|
|
@@ -47,20 +49,21 @@ module RDig
|
|
47
49
|
|
48
50
|
@logger.info "waiting for threads to finish..."
|
49
51
|
group.all_waits
|
52
|
+
@logger.info "indexed #{@indexer.indexed_documents} documents"
|
50
53
|
end
|
51
54
|
|
52
55
|
def process_document(doc, filterchain)
|
53
|
-
@logger.
|
56
|
+
@logger.info "processing document #{doc.uri}"
|
54
57
|
doc.fetch
|
55
58
|
case doc.status
|
56
59
|
when :success
|
57
|
-
if @etag_filter.apply(doc)
|
60
|
+
if @etag_filter.apply(doc)
|
58
61
|
# add links from this document to the queue
|
59
62
|
doc.content[:links].each { |url|
|
60
63
|
add_url(url, filterchain, doc)
|
61
64
|
} unless doc.content[:links].nil?
|
62
65
|
add_to_index doc
|
63
|
-
end
|
66
|
+
end
|
64
67
|
when :redirect
|
65
68
|
@logger.debug "redirect to #{doc.content}"
|
66
69
|
add_url(doc.content, filterchain, doc)
|
@@ -69,14 +72,16 @@ module RDig
|
|
69
72
|
end
|
70
73
|
rescue
|
71
74
|
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
|
72
|
-
@logger.
|
75
|
+
@logger.info "Trace: #{$!.backtrace.join("\n")}"
|
73
76
|
end
|
74
77
|
|
75
78
|
def add_to_index(doc)
|
76
|
-
|
79
|
+
if doc.needs_indexing?
|
80
|
+
@indexer << doc
|
81
|
+
end
|
77
82
|
end
|
78
83
|
|
79
|
-
|
84
|
+
|
80
85
|
# pipes a new document pointing to url through the filter chain,
|
81
86
|
# if it survives that, it gets added to the documents queue for further
|
82
87
|
# processing
|
@@ -90,19 +95,17 @@ module RDig
|
|
90
95
|
Document.create(url)
|
91
96
|
end
|
92
97
|
|
93
|
-
doc = filterchain.apply(doc)
|
94
|
-
|
95
|
-
if doc
|
98
|
+
if doc = filterchain.apply(doc)
|
96
99
|
@documents << doc
|
97
100
|
@logger.debug "url #{url} survived filterchain"
|
98
101
|
end
|
99
102
|
rescue
|
100
103
|
nil
|
101
104
|
end
|
102
|
-
|
105
|
+
|
103
106
|
end
|
104
107
|
|
105
|
-
|
108
|
+
|
106
109
|
# checks fetched documents' E-Tag headers against the list of E-Tags
|
107
110
|
# of the documents already indexed.
|
108
111
|
# This is supposed to help against double-indexing documents which can
|
data/lib/rdig/documents.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
#
|
4
4
|
# Document base class
|
5
5
|
#
|
6
6
|
class Document
|
7
|
-
|
7
|
+
|
8
8
|
attr_reader :uri
|
9
9
|
attr_reader :content
|
10
10
|
attr_reader :content_type
|
11
|
-
|
11
|
+
|
12
12
|
def self.create(url)
|
13
13
|
return case url
|
14
14
|
when /^https?:\/\//i
|
@@ -32,7 +32,7 @@ module RDig
|
|
32
32
|
def title; @content[:title] end
|
33
33
|
def body; @content[:content] end
|
34
34
|
def links; @content[:links] end
|
35
|
-
|
35
|
+
|
36
36
|
def needs_indexing?
|
37
37
|
has_content? && (title || body)
|
38
38
|
end
|
@@ -47,7 +47,7 @@ module RDig
|
|
47
47
|
|
48
48
|
end
|
49
49
|
|
50
|
-
|
50
|
+
|
51
51
|
#
|
52
52
|
# Document in a File system
|
53
53
|
#
|
@@ -90,8 +90,8 @@ module RDig
|
|
90
90
|
end
|
91
91
|
|
92
92
|
end
|
93
|
-
|
94
|
-
|
93
|
+
|
94
|
+
|
95
95
|
#
|
96
96
|
# Remote Document to be retrieved by HTTP
|
97
97
|
#
|
@@ -106,7 +106,7 @@ module RDig
|
|
106
106
|
def create_child(uri)
|
107
107
|
HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
|
108
108
|
end
|
109
|
-
|
109
|
+
|
110
110
|
# url: url of this document, may be relative to the referring doc or host.
|
111
111
|
# referrer: uri of the document we retrieved this link from
|
112
112
|
def initialize(args={})
|
data/lib/rdig/index.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
module RDig
|
2
2
|
module Index
|
3
|
-
|
3
|
+
|
4
4
|
# used by the crawler to build the ferret index
|
5
5
|
class Indexer
|
6
6
|
include MonitorMixin
|
7
|
-
|
7
|
+
|
8
|
+
attr_reader :indexed_documents
|
9
|
+
|
8
10
|
def initialize(settings)
|
11
|
+
@indexed_documents = 0
|
9
12
|
@config = settings
|
10
13
|
@index_writer = Ferret::Index::IndexWriter.new(
|
11
14
|
:path => settings.path,
|
@@ -13,7 +16,7 @@ module RDig
|
|
13
16
|
:analyzer => settings.analyzer)
|
14
17
|
super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
|
15
18
|
end
|
16
|
-
|
19
|
+
|
17
20
|
def add_to_index(document)
|
18
21
|
RDig.logger.debug "add to index: #{document.uri.to_s}"
|
19
22
|
@config.rewrite_uri.call(document.uri) if @config.rewrite_uri
|
@@ -25,16 +28,17 @@ module RDig
|
|
25
28
|
}
|
26
29
|
synchronize do
|
27
30
|
@index_writer << doc
|
31
|
+
@indexed_documents += 1
|
28
32
|
end
|
29
33
|
end
|
30
34
|
alias :<< :add_to_index
|
31
|
-
|
35
|
+
|
32
36
|
def close
|
33
37
|
@index_writer.optimize
|
34
38
|
@index_writer.close
|
35
39
|
@index_writer = nil
|
36
40
|
end
|
37
41
|
end
|
38
|
-
|
42
|
+
|
39
43
|
end
|
40
44
|
end
|
data/lib/rdig/search.rb
CHANGED
@@ -5,17 +5,17 @@ module RDig
|
|
5
5
|
# Call RDig::searcher to retrieve an instance ready for use.
|
6
6
|
class Searcher
|
7
7
|
include Ferret::Search
|
8
|
-
|
8
|
+
|
9
9
|
# the query parser used to parse query strings
|
10
10
|
attr_reader :query_parser
|
11
|
-
|
11
|
+
|
12
12
|
# takes the ferret section of the rdig configuration as a parameter.
|
13
13
|
def initialize(settings)
|
14
14
|
@ferret_config = settings
|
15
15
|
@query_parser = Ferret::QueryParser.new(settings.marshal_dump)
|
16
16
|
ferret_searcher
|
17
17
|
end
|
18
|
-
|
18
|
+
|
19
19
|
# returns the Ferret::Search::IndexSearcher instance used internally.
|
20
20
|
def ferret_searcher
|
21
21
|
if @ferret_searcher and !@ferret_searcher.reader.latest?
|
@@ -29,7 +29,14 @@ module RDig
|
|
29
29
|
end
|
30
30
|
@ferret_searcher
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
|
+
def get_maximum_score(query, options)
|
34
|
+
ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
|
35
|
+
return score
|
36
|
+
end
|
37
|
+
0
|
38
|
+
end
|
39
|
+
|
33
40
|
# run a search.
|
34
41
|
# +query+ usually will be a user-entered string. See the Ferret query
|
35
42
|
# language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
|
@@ -46,23 +53,26 @@ module RDig
|
|
46
53
|
RDig.logger.info "Query: #{query}"
|
47
54
|
results = []
|
48
55
|
searcher = ferret_searcher
|
56
|
+
maximum_score = get_maximum_score query, options
|
49
57
|
result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
|
50
58
|
doc = searcher[doc_id]
|
51
59
|
results << { :score => score,
|
52
60
|
:title => doc[:title],
|
53
61
|
:url => doc[:url],
|
54
|
-
:extract => build_extract(doc[:data])
|
62
|
+
:extract => build_extract(doc[:data]),
|
63
|
+
:relative_score => (score / maximum_score)
|
64
|
+
}
|
55
65
|
end
|
56
66
|
result[:list] = results
|
57
67
|
result
|
58
68
|
end
|
59
|
-
|
69
|
+
|
60
70
|
def build_extract(data)
|
61
71
|
(data && data.length > 200) ? data[0..200] : data
|
62
72
|
end
|
63
|
-
|
73
|
+
|
64
74
|
end
|
65
|
-
|
75
|
+
|
66
76
|
# class SearchResult < OpenStruct
|
67
77
|
# def initialize(doc, score)
|
68
78
|
# self.score = score
|
@@ -72,6 +82,6 @@ module RDig
|
|
72
82
|
# end
|
73
83
|
# end
|
74
84
|
|
75
|
-
|
85
|
+
|
76
86
|
end
|
77
87
|
end
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
module RDig
|
2
|
-
|
2
|
+
|
3
3
|
module UrlFilters
|
4
4
|
|
5
5
|
class FilterChain
|
6
6
|
def initialize(chain_config)
|
7
|
+
@logger = RDig.logger
|
7
8
|
@filters = []
|
8
9
|
chain_config.each { |filter|
|
9
10
|
case filter
|
@@ -29,11 +30,23 @@ module RDig
|
|
29
30
|
when Symbol
|
30
31
|
if args.nil?
|
31
32
|
@filters << lambda { |document|
|
32
|
-
|
33
|
+
begin
|
34
|
+
UrlFilters.send(filter, document)
|
35
|
+
rescue Exception
|
36
|
+
@logger.error "error in URL filter #{filter}: #{$!}"
|
37
|
+
@logger.error $!.backtrace.join("\n")
|
38
|
+
nil
|
39
|
+
end
|
33
40
|
}
|
34
41
|
else
|
35
42
|
@filters << lambda { |document|
|
36
|
-
|
43
|
+
begin
|
44
|
+
UrlFilters.send(filter, document, args)
|
45
|
+
rescue Exception
|
46
|
+
@logger.error "error in URL filter #{filter}: #{$!}"
|
47
|
+
@logger.error $!.backtrace.join("\n")
|
48
|
+
nil
|
49
|
+
end
|
37
50
|
}
|
38
51
|
end
|
39
52
|
when Class
|
@@ -54,7 +67,13 @@ module RDig
|
|
54
67
|
|
55
68
|
def apply(document)
|
56
69
|
@filters.each { |filter|
|
57
|
-
|
70
|
+
@logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
|
71
|
+
unless filter.call(document)
|
72
|
+
@logger.debug "fail"
|
73
|
+
return nil
|
74
|
+
else
|
75
|
+
@logger.debug 'OK'
|
76
|
+
end
|
58
77
|
}
|
59
78
|
return document
|
60
79
|
end
|
@@ -75,7 +94,7 @@ module RDig
|
|
75
94
|
# nil otherwise
|
76
95
|
def apply(document)
|
77
96
|
synchronize do
|
78
|
-
@visited_urls.add?(document.uri.to_s) ? document : nil
|
97
|
+
@visited_urls.add?(document.uri.to_s) ? document : nil
|
79
98
|
end
|
80
99
|
end
|
81
100
|
end
|
@@ -174,7 +193,7 @@ module RDig
|
|
174
193
|
uri.host = ref.host unless uri.host
|
175
194
|
uri.port = ref.port unless uri.port || ref.port==ref.default_port
|
176
195
|
uri.path = ref.path unless uri.path
|
177
|
-
|
196
|
+
|
178
197
|
old_uri_path = uri.path
|
179
198
|
if uri.path !~ /^\// || uri.path =~ /^\.\./
|
180
199
|
ref_path = ref.path || '/'
|
@@ -202,7 +221,7 @@ module RDig
|
|
202
221
|
if document.uri.path =~ /\/$/
|
203
222
|
# append index document if configured
|
204
223
|
if cfg.index_document
|
205
|
-
document.uri.path <<
|
224
|
+
document.uri.path << cfg.index_document
|
206
225
|
elsif cfg.remove_trailing_slash
|
207
226
|
document.uri.path.gsub! /\/$/, ''
|
208
227
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 3
|
8
|
-
- 9
|
9
|
-
version: 0.3.9
|
8
|
+
- 10
|
9
|
+
version: 0.3.10
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jens Kraemer
|
@@ -67,7 +67,7 @@ extensions: []
|
|
67
67
|
extra_rdoc_files:
|
68
68
|
- History.txt
|
69
69
|
- Manifest.txt
|
70
|
-
- README
|
70
|
+
- README.rdoc
|
71
71
|
files:
|
72
72
|
- CHANGES
|
73
73
|
- History.txt
|
@@ -75,7 +75,7 @@ files:
|
|
75
75
|
- LICENSE
|
76
76
|
- Manifest.txt
|
77
77
|
- rakefile
|
78
|
-
- README
|
78
|
+
- README.rdoc
|
79
79
|
- bin/rdig
|
80
80
|
- doc/examples/config.rb
|
81
81
|
- lib/rdig/content_extractors/doc.rb
|