rdig 0.3.9 → 0.3.10

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2006 Jens Kraemer
1
+ Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
25
25
  === Handle search in your application:
26
26
  require 'rdig'
27
27
  require 'rdig_config' # load your config file here
28
- search_results = RDig.searcher.search(query, options={})
28
+ search_results = RDig.searcher.search(query)
29
29
 
30
30
  see RDig::Search::Searcher for more information.
31
31
 
data/bin/rdig CHANGED
File without changes
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  #--
4
- # Copyright (c) 2006 Jens Kraemer
4
+ # Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
5
5
  #
6
6
  # Permission is hereby granted, free of charge, to any person obtaining
7
7
  # a copy of this software and associated documentation files (the
@@ -84,7 +84,7 @@ module RDig
84
84
  { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
85
85
  ]
86
86
  }
87
-
87
+
88
88
  end
89
89
 
90
90
  def application
@@ -150,7 +150,7 @@ module RDig
150
150
  end
151
151
  end
152
152
  alias config configuration
153
-
153
+
154
154
  def logger
155
155
  @logger ||= create_logger
156
156
  end
@@ -200,12 +200,12 @@ module RDig
200
200
  def options
201
201
  @options ||= OpenStruct.new
202
202
  end
203
-
203
+
204
204
  # Display the program usage line.
205
205
  def usage
206
206
  puts "rdig -c configfile {options}"
207
207
  end
208
-
208
+
209
209
  # Display the rake command line help.
210
210
  def help
211
211
  usage
@@ -266,8 +266,7 @@ module RDig
266
266
  rescue
267
267
  puts $!.backtrace
268
268
  fail "No Configfile found!\n#{$!}"
269
-
270
- end
269
+ end
271
270
 
272
271
  puts "using Ferret #{Ferret::VERSION}"
273
272
 
@@ -1,5 +1,5 @@
1
1
  module RDig
2
-
2
+
3
3
  # Contains classes which are used for extracting content and meta data from
4
4
  # various content types.
5
5
  module ContentExtractors
@@ -13,7 +13,7 @@ module RDig
13
13
  # Extractors inheriting from this class will be auto-discovered and used
14
14
  # when can_do returns true
15
15
  class ContentExtractor
16
-
16
+
17
17
  def self.inherited(extractor)
18
18
  super(extractor)
19
19
  self.extractors << extractor
@@ -32,7 +32,7 @@ module RDig
32
32
  ex
33
33
  }.compact
34
34
  end
35
-
35
+
36
36
  def self.process(content, content_type)
37
37
  self.extractor_instances.each { |extractor|
38
38
  return extractor.process(content) if extractor.can_do(content_type)
@@ -61,7 +61,7 @@ module RDig
61
61
  end
62
62
  result
63
63
  end
64
-
64
+
65
65
  def as_file(content)
66
66
  file = Tempfile.new('rdig')
67
67
  file << content
@@ -8,7 +8,7 @@ module RDig
8
8
  #
9
9
  class PdfContentExtractor < ContentExtractor
10
10
  include ExternalAppHelper
11
-
11
+
12
12
  def initialize(config)
13
13
  super(config)
14
14
  @pattern = /^application\/pdf/
@@ -22,7 +22,7 @@ module RDig
22
22
  end
23
23
  }
24
24
  end
25
-
25
+
26
26
  def process(content)
27
27
  result = {}
28
28
  as_file(content) do |file|
@@ -35,7 +35,7 @@ module RDig
35
35
  def get_content(path_to_tempfile)
36
36
  %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
37
37
  end
38
-
38
+
39
39
  # extracts the title from pdf meta data
40
40
  # needs pdfinfo
41
41
  # returns the title or nil if no title was found
@@ -1,12 +1,11 @@
1
1
  module RDig
2
-
3
-
4
2
  class Crawler
5
-
3
+
6
4
  def initialize(config = RDig.config, logger = RDig.logger)
7
5
  @documents = Queue.new
8
6
  @logger = logger
9
7
  @config = config
8
+ @indexed_documents = 0
10
9
  end
11
10
 
12
11
  def run
@@ -22,6 +21,7 @@ module RDig
22
21
  url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
23
22
  chain_config = RDig.filter_chain[url_type]
24
23
 
24
+ # the etag filter operates on the fetched document, that's why we cannot put it into the filter chain right now.
25
25
  @etag_filter = ETagFilter.new
26
26
  filterchain = UrlFilters::FilterChain.new(chain_config)
27
27
  @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -31,9 +31,11 @@ module RDig
31
31
  num_threads.times { |i|
32
32
  group.join_nowait Thread.new("fetcher #{i}") {
33
33
  filterchain = UrlFilters::FilterChain.new(chain_config)
34
+ @logger.info "thread #{i} running..."
34
35
  while (doc = @documents.pop) != :exit
35
36
  process_document doc, filterchain
36
37
  end
38
+ @logger.info "thread #{i} is done."
37
39
  }
38
40
  }
39
41
 
@@ -47,20 +49,21 @@ module RDig
47
49
 
48
50
  @logger.info "waiting for threads to finish..."
49
51
  group.all_waits
52
+ @logger.info "indexed #{@indexer.indexed_documents} documents"
50
53
  end
51
54
 
52
55
  def process_document(doc, filterchain)
53
- @logger.debug "processing document #{doc}"
56
+ @logger.info "processing document #{doc.uri}"
54
57
  doc.fetch
55
58
  case doc.status
56
59
  when :success
57
- if @etag_filter.apply(doc)
60
+ if @etag_filter.apply(doc)
58
61
  # add links from this document to the queue
59
62
  doc.content[:links].each { |url|
60
63
  add_url(url, filterchain, doc)
61
64
  } unless doc.content[:links].nil?
62
65
  add_to_index doc
63
- end
66
+ end
64
67
  when :redirect
65
68
  @logger.debug "redirect to #{doc.content}"
66
69
  add_url(doc.content, filterchain, doc)
@@ -69,14 +72,16 @@ module RDig
69
72
  end
70
73
  rescue
71
74
  @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
72
- @logger.debug "Trace: #{$!.backtrace.join("\n")}"
75
+ @logger.info "Trace: #{$!.backtrace.join("\n")}"
73
76
  end
74
77
 
75
78
  def add_to_index(doc)
76
- @indexer << doc if doc.needs_indexing?
79
+ if doc.needs_indexing?
80
+ @indexer << doc
81
+ end
77
82
  end
78
83
 
79
-
84
+
80
85
  # pipes a new document pointing to url through the filter chain,
81
86
  # if it survives that, it gets added to the documents queue for further
82
87
  # processing
@@ -90,19 +95,17 @@ module RDig
90
95
  Document.create(url)
91
96
  end
92
97
 
93
- doc = filterchain.apply(doc)
94
-
95
- if doc
98
+ if doc = filterchain.apply(doc)
96
99
  @documents << doc
97
100
  @logger.debug "url #{url} survived filterchain"
98
101
  end
99
102
  rescue
100
103
  nil
101
104
  end
102
-
105
+
103
106
  end
104
107
 
105
-
108
+
106
109
  # checks fetched documents' E-Tag headers against the list of E-Tags
107
110
  # of the documents already indexed.
108
111
  # This is supposed to help against double-indexing documents which can
@@ -1,14 +1,14 @@
1
1
  module RDig
2
-
2
+
3
3
  #
4
4
  # Document base class
5
5
  #
6
6
  class Document
7
-
7
+
8
8
  attr_reader :uri
9
9
  attr_reader :content
10
10
  attr_reader :content_type
11
-
11
+
12
12
  def self.create(url)
13
13
  return case url
14
14
  when /^https?:\/\//i
@@ -32,7 +32,7 @@ module RDig
32
32
  def title; @content[:title] end
33
33
  def body; @content[:content] end
34
34
  def links; @content[:links] end
35
-
35
+
36
36
  def needs_indexing?
37
37
  has_content? && (title || body)
38
38
  end
@@ -47,7 +47,7 @@ module RDig
47
47
 
48
48
  end
49
49
 
50
-
50
+
51
51
  #
52
52
  # Document in a File system
53
53
  #
@@ -90,8 +90,8 @@ module RDig
90
90
  end
91
91
 
92
92
  end
93
-
94
-
93
+
94
+
95
95
  #
96
96
  # Remote Document to be retrieved by HTTP
97
97
  #
@@ -106,7 +106,7 @@ module RDig
106
106
  def create_child(uri)
107
107
  HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
108
108
  end
109
-
109
+
110
110
  # url: url of this document, may be relative to the referring doc or host.
111
111
  # referrer: uri of the document we retrieved this link from
112
112
  def initialize(args={})
@@ -1,11 +1,14 @@
1
1
  module RDig
2
2
  module Index
3
-
3
+
4
4
  # used by the crawler to build the ferret index
5
5
  class Indexer
6
6
  include MonitorMixin
7
-
7
+
8
+ attr_reader :indexed_documents
9
+
8
10
  def initialize(settings)
11
+ @indexed_documents = 0
9
12
  @config = settings
10
13
  @index_writer = Ferret::Index::IndexWriter.new(
11
14
  :path => settings.path,
@@ -13,7 +16,7 @@ module RDig
13
16
  :analyzer => settings.analyzer)
14
17
  super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
15
18
  end
16
-
19
+
17
20
  def add_to_index(document)
18
21
  RDig.logger.debug "add to index: #{document.uri.to_s}"
19
22
  @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
@@ -25,16 +28,17 @@ module RDig
25
28
  }
26
29
  synchronize do
27
30
  @index_writer << doc
31
+ @indexed_documents += 1
28
32
  end
29
33
  end
30
34
  alias :<< :add_to_index
31
-
35
+
32
36
  def close
33
37
  @index_writer.optimize
34
38
  @index_writer.close
35
39
  @index_writer = nil
36
40
  end
37
41
  end
38
-
42
+
39
43
  end
40
44
  end
@@ -5,17 +5,17 @@ module RDig
5
5
  # Call RDig::searcher to retrieve an instance ready for use.
6
6
  class Searcher
7
7
  include Ferret::Search
8
-
8
+
9
9
  # the query parser used to parse query strings
10
10
  attr_reader :query_parser
11
-
11
+
12
12
  # takes the ferret section of the rdig configuration as a parameter.
13
13
  def initialize(settings)
14
14
  @ferret_config = settings
15
15
  @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
16
16
  ferret_searcher
17
17
  end
18
-
18
+
19
19
  # returns the Ferret::Search::IndexSearcher instance used internally.
20
20
  def ferret_searcher
21
21
  if @ferret_searcher and !@ferret_searcher.reader.latest?
@@ -29,7 +29,14 @@ module RDig
29
29
  end
30
30
  @ferret_searcher
31
31
  end
32
-
32
+
33
+ def get_maximum_score(query, options)
34
+ ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
35
+ return score
36
+ end
37
+ 0
38
+ end
39
+
33
40
  # run a search.
34
41
  # +query+ usually will be a user-entered string. See the Ferret query
35
42
  # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
@@ -46,23 +53,26 @@ module RDig
46
53
  RDig.logger.info "Query: #{query}"
47
54
  results = []
48
55
  searcher = ferret_searcher
56
+ maximum_score = get_maximum_score query, options
49
57
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
50
58
  doc = searcher[doc_id]
51
59
  results << { :score => score,
52
60
  :title => doc[:title],
53
61
  :url => doc[:url],
54
- :extract => build_extract(doc[:data]) }
62
+ :extract => build_extract(doc[:data]),
63
+ :relative_score => (score / maximum_score)
64
+ }
55
65
  end
56
66
  result[:list] = results
57
67
  result
58
68
  end
59
-
69
+
60
70
  def build_extract(data)
61
71
  (data && data.length > 200) ? data[0..200] : data
62
72
  end
63
-
73
+
64
74
  end
65
-
75
+
66
76
  # class SearchResult < OpenStruct
67
77
  # def initialize(doc, score)
68
78
  # self.score = score
@@ -72,6 +82,6 @@ module RDig
72
82
  # end
73
83
  # end
74
84
 
75
-
85
+
76
86
  end
77
87
  end
@@ -1,9 +1,10 @@
1
1
  module RDig
2
-
2
+
3
3
  module UrlFilters
4
4
 
5
5
  class FilterChain
6
6
  def initialize(chain_config)
7
+ @logger = RDig.logger
7
8
  @filters = []
8
9
  chain_config.each { |filter|
9
10
  case filter
@@ -29,11 +30,23 @@ module RDig
29
30
  when Symbol
30
31
  if args.nil?
31
32
  @filters << lambda { |document|
32
- UrlFilters.send(filter, document)
33
+ begin
34
+ UrlFilters.send(filter, document)
35
+ rescue Exception
36
+ @logger.error "error in URL filter #{filter}: #{$!}"
37
+ @logger.error $!.backtrace.join("\n")
38
+ nil
39
+ end
33
40
  }
34
41
  else
35
42
  @filters << lambda { |document|
36
- UrlFilters.send(filter, document, args)
43
+ begin
44
+ UrlFilters.send(filter, document, args)
45
+ rescue Exception
46
+ @logger.error "error in URL filter #{filter}: #{$!}"
47
+ @logger.error $!.backtrace.join("\n")
48
+ nil
49
+ end
37
50
  }
38
51
  end
39
52
  when Class
@@ -54,7 +67,13 @@ module RDig
54
67
 
55
68
  def apply(document)
56
69
  @filters.each { |filter|
57
- return nil unless filter.call(document)
70
+ @logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
71
+ unless filter.call(document)
72
+ @logger.debug "fail"
73
+ return nil
74
+ else
75
+ @logger.debug 'OK'
76
+ end
58
77
  }
59
78
  return document
60
79
  end
@@ -75,7 +94,7 @@ module RDig
75
94
  # nil otherwise
76
95
  def apply(document)
77
96
  synchronize do
78
- @visited_urls.add?(document.uri.to_s) ? document : nil
97
+ @visited_urls.add?(document.uri.to_s) ? document : nil
79
98
  end
80
99
  end
81
100
  end
@@ -174,7 +193,7 @@ module RDig
174
193
  uri.host = ref.host unless uri.host
175
194
  uri.port = ref.port unless uri.port || ref.port==ref.default_port
176
195
  uri.path = ref.path unless uri.path
177
-
196
+
178
197
  old_uri_path = uri.path
179
198
  if uri.path !~ /^\// || uri.path =~ /^\.\./
180
199
  ref_path = ref.path || '/'
@@ -202,7 +221,7 @@ module RDig
202
221
  if document.uri.path =~ /\/$/
203
222
  # append index document if configured
204
223
  if cfg.index_document
205
- document.uri.path << RDig.config.index_document
224
+ document.uri.path << cfg.index_document
206
225
  elsif cfg.remove_trailing_slash
207
226
  document.uri.path.gsub! /\/$/, ''
208
227
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 9
9
- version: 0.3.9
8
+ - 10
9
+ version: 0.3.10
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jens Kraemer
@@ -67,7 +67,7 @@ extensions: []
67
67
  extra_rdoc_files:
68
68
  - History.txt
69
69
  - Manifest.txt
70
- - README
70
+ - README.rdoc
71
71
  files:
72
72
  - CHANGES
73
73
  - History.txt
@@ -75,7 +75,7 @@ files:
75
75
  - LICENSE
76
76
  - Manifest.txt
77
77
  - rakefile
78
- - README
78
+ - README.rdoc
79
79
  - bin/rdig
80
80
  - doc/examples/config.rb
81
81
  - lib/rdig/content_extractors/doc.rb