rdig 0.3.9 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2006 Jens Kraemer
1
+ Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
25
25
  === Handle search in your application:
26
26
  require 'rdig'
27
27
  require 'rdig_config' # load your config file here
28
- search_results = RDig.searcher.search(query, options={})
28
+ search_results = RDig.searcher.search(query)
29
29
 
30
30
  see RDig::Search::Searcher for more information.
31
31
 
data/bin/rdig CHANGED
File without changes
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  #--
4
- # Copyright (c) 2006 Jens Kraemer
4
+ # Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
5
5
  #
6
6
  # Permission is hereby granted, free of charge, to any person obtaining
7
7
  # a copy of this software and associated documentation files (the
@@ -84,7 +84,7 @@ module RDig
84
84
  { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
85
85
  ]
86
86
  }
87
-
87
+
88
88
  end
89
89
 
90
90
  def application
@@ -150,7 +150,7 @@ module RDig
150
150
  end
151
151
  end
152
152
  alias config configuration
153
-
153
+
154
154
  def logger
155
155
  @logger ||= create_logger
156
156
  end
@@ -200,12 +200,12 @@ module RDig
200
200
  def options
201
201
  @options ||= OpenStruct.new
202
202
  end
203
-
203
+
204
204
  # Display the program usage line.
205
205
  def usage
206
206
  puts "rdig -c configfile {options}"
207
207
  end
208
-
208
+
209
209
  # Display the rake command line help.
210
210
  def help
211
211
  usage
@@ -266,8 +266,7 @@ module RDig
266
266
  rescue
267
267
  puts $!.backtrace
268
268
  fail "No Configfile found!\n#{$!}"
269
-
270
- end
269
+ end
271
270
 
272
271
  puts "using Ferret #{Ferret::VERSION}"
273
272
 
@@ -1,5 +1,5 @@
1
1
  module RDig
2
-
2
+
3
3
  # Contains classes which are used for extracting content and meta data from
4
4
  # various content types.
5
5
  module ContentExtractors
@@ -13,7 +13,7 @@ module RDig
13
13
  # Extractors inheriting from this class will be auto-discovered and used
14
14
  # when can_do returns true
15
15
  class ContentExtractor
16
-
16
+
17
17
  def self.inherited(extractor)
18
18
  super(extractor)
19
19
  self.extractors << extractor
@@ -32,7 +32,7 @@ module RDig
32
32
  ex
33
33
  }.compact
34
34
  end
35
-
35
+
36
36
  def self.process(content, content_type)
37
37
  self.extractor_instances.each { |extractor|
38
38
  return extractor.process(content) if extractor.can_do(content_type)
@@ -61,7 +61,7 @@ module RDig
61
61
  end
62
62
  result
63
63
  end
64
-
64
+
65
65
  def as_file(content)
66
66
  file = Tempfile.new('rdig')
67
67
  file << content
@@ -8,7 +8,7 @@ module RDig
8
8
  #
9
9
  class PdfContentExtractor < ContentExtractor
10
10
  include ExternalAppHelper
11
-
11
+
12
12
  def initialize(config)
13
13
  super(config)
14
14
  @pattern = /^application\/pdf/
@@ -22,7 +22,7 @@ module RDig
22
22
  end
23
23
  }
24
24
  end
25
-
25
+
26
26
  def process(content)
27
27
  result = {}
28
28
  as_file(content) do |file|
@@ -35,7 +35,7 @@ module RDig
35
35
  def get_content(path_to_tempfile)
36
36
  %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
37
37
  end
38
-
38
+
39
39
  # extracts the title from pdf meta data
40
40
  # needs pdfinfo
41
41
  # returns the title or nil if no title was found
@@ -1,12 +1,11 @@
1
1
  module RDig
2
-
3
-
4
2
  class Crawler
5
-
3
+
6
4
  def initialize(config = RDig.config, logger = RDig.logger)
7
5
  @documents = Queue.new
8
6
  @logger = logger
9
7
  @config = config
8
+ @indexed_documents = 0
10
9
  end
11
10
 
12
11
  def run
@@ -22,6 +21,7 @@ module RDig
22
21
  url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
23
22
  chain_config = RDig.filter_chain[url_type]
24
23
 
24
+ # the etag filter operates on the fetched document, that's why we cannot put it into the filter chain right now.
25
25
  @etag_filter = ETagFilter.new
26
26
  filterchain = UrlFilters::FilterChain.new(chain_config)
27
27
  @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -31,9 +31,11 @@ module RDig
31
31
  num_threads.times { |i|
32
32
  group.join_nowait Thread.new("fetcher #{i}") {
33
33
  filterchain = UrlFilters::FilterChain.new(chain_config)
34
+ @logger.info "thread #{i} running..."
34
35
  while (doc = @documents.pop) != :exit
35
36
  process_document doc, filterchain
36
37
  end
38
+ @logger.info "thread #{i} is done."
37
39
  }
38
40
  }
39
41
 
@@ -47,20 +49,21 @@ module RDig
47
49
 
48
50
  @logger.info "waiting for threads to finish..."
49
51
  group.all_waits
52
+ @logger.info "indexed #{@indexer.indexed_documents} documents"
50
53
  end
51
54
 
52
55
  def process_document(doc, filterchain)
53
- @logger.debug "processing document #{doc}"
56
+ @logger.info "processing document #{doc.uri}"
54
57
  doc.fetch
55
58
  case doc.status
56
59
  when :success
57
- if @etag_filter.apply(doc)
60
+ if @etag_filter.apply(doc)
58
61
  # add links from this document to the queue
59
62
  doc.content[:links].each { |url|
60
63
  add_url(url, filterchain, doc)
61
64
  } unless doc.content[:links].nil?
62
65
  add_to_index doc
63
- end
66
+ end
64
67
  when :redirect
65
68
  @logger.debug "redirect to #{doc.content}"
66
69
  add_url(doc.content, filterchain, doc)
@@ -69,14 +72,16 @@ module RDig
69
72
  end
70
73
  rescue
71
74
  @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
72
- @logger.debug "Trace: #{$!.backtrace.join("\n")}"
75
+ @logger.info "Trace: #{$!.backtrace.join("\n")}"
73
76
  end
74
77
 
75
78
  def add_to_index(doc)
76
- @indexer << doc if doc.needs_indexing?
79
+ if doc.needs_indexing?
80
+ @indexer << doc
81
+ end
77
82
  end
78
83
 
79
-
84
+
80
85
  # pipes a new document pointing to url through the filter chain,
81
86
  # if it survives that, it gets added to the documents queue for further
82
87
  # processing
@@ -90,19 +95,17 @@ module RDig
90
95
  Document.create(url)
91
96
  end
92
97
 
93
- doc = filterchain.apply(doc)
94
-
95
- if doc
98
+ if doc = filterchain.apply(doc)
96
99
  @documents << doc
97
100
  @logger.debug "url #{url} survived filterchain"
98
101
  end
99
102
  rescue
100
103
  nil
101
104
  end
102
-
105
+
103
106
  end
104
107
 
105
-
108
+
106
109
  # checks fetched documents' E-Tag headers against the list of E-Tags
107
110
  # of the documents already indexed.
108
111
  # This is supposed to help against double-indexing documents which can
@@ -1,14 +1,14 @@
1
1
  module RDig
2
-
2
+
3
3
  #
4
4
  # Document base class
5
5
  #
6
6
  class Document
7
-
7
+
8
8
  attr_reader :uri
9
9
  attr_reader :content
10
10
  attr_reader :content_type
11
-
11
+
12
12
  def self.create(url)
13
13
  return case url
14
14
  when /^https?:\/\//i
@@ -32,7 +32,7 @@ module RDig
32
32
  def title; @content[:title] end
33
33
  def body; @content[:content] end
34
34
  def links; @content[:links] end
35
-
35
+
36
36
  def needs_indexing?
37
37
  has_content? && (title || body)
38
38
  end
@@ -47,7 +47,7 @@ module RDig
47
47
 
48
48
  end
49
49
 
50
-
50
+
51
51
  #
52
52
  # Document in a File system
53
53
  #
@@ -90,8 +90,8 @@ module RDig
90
90
  end
91
91
 
92
92
  end
93
-
94
-
93
+
94
+
95
95
  #
96
96
  # Remote Document to be retrieved by HTTP
97
97
  #
@@ -106,7 +106,7 @@ module RDig
106
106
  def create_child(uri)
107
107
  HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
108
108
  end
109
-
109
+
110
110
  # url: url of this document, may be relative to the referring doc or host.
111
111
  # referrer: uri of the document we retrieved this link from
112
112
  def initialize(args={})
@@ -1,11 +1,14 @@
1
1
  module RDig
2
2
  module Index
3
-
3
+
4
4
  # used by the crawler to build the ferret index
5
5
  class Indexer
6
6
  include MonitorMixin
7
-
7
+
8
+ attr_reader :indexed_documents
9
+
8
10
  def initialize(settings)
11
+ @indexed_documents = 0
9
12
  @config = settings
10
13
  @index_writer = Ferret::Index::IndexWriter.new(
11
14
  :path => settings.path,
@@ -13,7 +16,7 @@ module RDig
13
16
  :analyzer => settings.analyzer)
14
17
  super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
15
18
  end
16
-
19
+
17
20
  def add_to_index(document)
18
21
  RDig.logger.debug "add to index: #{document.uri.to_s}"
19
22
  @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
@@ -25,16 +28,17 @@ module RDig
25
28
  }
26
29
  synchronize do
27
30
  @index_writer << doc
31
+ @indexed_documents += 1
28
32
  end
29
33
  end
30
34
  alias :<< :add_to_index
31
-
35
+
32
36
  def close
33
37
  @index_writer.optimize
34
38
  @index_writer.close
35
39
  @index_writer = nil
36
40
  end
37
41
  end
38
-
42
+
39
43
  end
40
44
  end
@@ -5,17 +5,17 @@ module RDig
5
5
  # Call RDig::searcher to retrieve an instance ready for use.
6
6
  class Searcher
7
7
  include Ferret::Search
8
-
8
+
9
9
  # the query parser used to parse query strings
10
10
  attr_reader :query_parser
11
-
11
+
12
12
  # takes the ferret section of the rdig configuration as a parameter.
13
13
  def initialize(settings)
14
14
  @ferret_config = settings
15
15
  @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
16
16
  ferret_searcher
17
17
  end
18
-
18
+
19
19
  # returns the Ferret::Search::IndexSearcher instance used internally.
20
20
  def ferret_searcher
21
21
  if @ferret_searcher and !@ferret_searcher.reader.latest?
@@ -29,7 +29,14 @@ module RDig
29
29
  end
30
30
  @ferret_searcher
31
31
  end
32
-
32
+
33
+ def get_maximum_score(query, options)
34
+ ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
35
+ return score
36
+ end
37
+ 0
38
+ end
39
+
33
40
  # run a search.
34
41
  # +query+ usually will be a user-entered string. See the Ferret query
35
42
  # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
@@ -46,23 +53,26 @@ module RDig
46
53
  RDig.logger.info "Query: #{query}"
47
54
  results = []
48
55
  searcher = ferret_searcher
56
+ maximum_score = get_maximum_score query, options
49
57
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
50
58
  doc = searcher[doc_id]
51
59
  results << { :score => score,
52
60
  :title => doc[:title],
53
61
  :url => doc[:url],
54
- :extract => build_extract(doc[:data]) }
62
+ :extract => build_extract(doc[:data]),
63
+ :relative_score => (score / maximum_score)
64
+ }
55
65
  end
56
66
  result[:list] = results
57
67
  result
58
68
  end
59
-
69
+
60
70
  def build_extract(data)
61
71
  (data && data.length > 200) ? data[0..200] : data
62
72
  end
63
-
73
+
64
74
  end
65
-
75
+
66
76
  # class SearchResult < OpenStruct
67
77
  # def initialize(doc, score)
68
78
  # self.score = score
@@ -72,6 +82,6 @@ module RDig
72
82
  # end
73
83
  # end
74
84
 
75
-
85
+
76
86
  end
77
87
  end
@@ -1,9 +1,10 @@
1
1
  module RDig
2
-
2
+
3
3
  module UrlFilters
4
4
 
5
5
  class FilterChain
6
6
  def initialize(chain_config)
7
+ @logger = RDig.logger
7
8
  @filters = []
8
9
  chain_config.each { |filter|
9
10
  case filter
@@ -29,11 +30,23 @@ module RDig
29
30
  when Symbol
30
31
  if args.nil?
31
32
  @filters << lambda { |document|
32
- UrlFilters.send(filter, document)
33
+ begin
34
+ UrlFilters.send(filter, document)
35
+ rescue Exception
36
+ @logger.error "error in URL filter #{filter}: #{$!}"
37
+ @logger.error $!.backtrace.join("\n")
38
+ nil
39
+ end
33
40
  }
34
41
  else
35
42
  @filters << lambda { |document|
36
- UrlFilters.send(filter, document, args)
43
+ begin
44
+ UrlFilters.send(filter, document, args)
45
+ rescue Exception
46
+ @logger.error "error in URL filter #{filter}: #{$!}"
47
+ @logger.error $!.backtrace.join("\n")
48
+ nil
49
+ end
37
50
  }
38
51
  end
39
52
  when Class
@@ -54,7 +67,13 @@ module RDig
54
67
 
55
68
  def apply(document)
56
69
  @filters.each { |filter|
57
- return nil unless filter.call(document)
70
+ @logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
71
+ unless filter.call(document)
72
+ @logger.debug "fail"
73
+ return nil
74
+ else
75
+ @logger.debug 'OK'
76
+ end
58
77
  }
59
78
  return document
60
79
  end
@@ -75,7 +94,7 @@ module RDig
75
94
  # nil otherwise
76
95
  def apply(document)
77
96
  synchronize do
78
- @visited_urls.add?(document.uri.to_s) ? document : nil
97
+ @visited_urls.add?(document.uri.to_s) ? document : nil
79
98
  end
80
99
  end
81
100
  end
@@ -174,7 +193,7 @@ module RDig
174
193
  uri.host = ref.host unless uri.host
175
194
  uri.port = ref.port unless uri.port || ref.port==ref.default_port
176
195
  uri.path = ref.path unless uri.path
177
-
196
+
178
197
  old_uri_path = uri.path
179
198
  if uri.path !~ /^\// || uri.path =~ /^\.\./
180
199
  ref_path = ref.path || '/'
@@ -202,7 +221,7 @@ module RDig
202
221
  if document.uri.path =~ /\/$/
203
222
  # append index document if configured
204
223
  if cfg.index_document
205
- document.uri.path << RDig.config.index_document
224
+ document.uri.path << cfg.index_document
206
225
  elsif cfg.remove_trailing_slash
207
226
  document.uri.path.gsub! /\/$/, ''
208
227
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 9
9
- version: 0.3.9
8
+ - 10
9
+ version: 0.3.10
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jens Kraemer
@@ -67,7 +67,7 @@ extensions: []
67
67
  extra_rdoc_files:
68
68
  - History.txt
69
69
  - Manifest.txt
70
- - README
70
+ - README.rdoc
71
71
  files:
72
72
  - CHANGES
73
73
  - History.txt
@@ -75,7 +75,7 @@ files:
75
75
  - LICENSE
76
76
  - Manifest.txt
77
77
  - rakefile
78
- - README
78
+ - README.rdoc
79
79
  - bin/rdig
80
80
  - doc/examples/config.rb
81
81
  - lib/rdig/content_extractors/doc.rb