rdig 0.3.10 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ RDig.configuration do |cfg|
8
8
 
9
9
  # log level, set to :debug, :info, :warn or :error
10
10
  cfg.log_level = :info
11
-
11
+
12
12
  # provide one or more URLs for the crawler to start from
13
13
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
14
14
 
@@ -26,15 +26,19 @@ RDig.configuration do |cfg|
26
26
  # caution, existing contents of this directory will be deleted!
27
27
  cfg.index.path = '/path/to/index'
28
28
 
29
+ # Set the analyzer to use.
30
+ # See the Ferret::Analysis package rdoc for more info about analyzers.
31
+ # cfg.index.analyzer = Ferret::Analysis::StandardAnalyzer.new
32
+
29
33
  ##################################################################
30
34
  # options you might want to set, the given values are the defaults
31
35
 
32
36
  # set to true to get stack traces on errors
33
37
  # cfg.verbose = false
34
-
38
+
35
39
  # content extraction options
36
40
  cfg.content_extraction = OpenStruct.new(
37
-
41
+
38
42
  # HPRICOT configuration
39
43
  # hpricot is the html parsing lib used by RDig. See
40
44
  # http://code.whytheluckystiff.net/hpricot for usage information.
@@ -52,11 +56,11 @@ RDig.configuration do |cfg|
52
56
  )
53
57
 
54
58
  # crawler options
55
-
59
+
56
60
  # Notice: for file system crawling the include/exclude_document patterns are
57
61
  # applied to the full path of _files_ only (like /home/bob/test.pdf),
58
62
  # for http to full URIs (like http://example.com/index.html).
59
-
63
+
60
64
  # nil (include all documents) or an array of Regexps
61
65
  # matching the URLs you want to index.
62
66
  # cfg.crawler.include_documents = nil
@@ -67,7 +71,7 @@ RDig.configuration do |cfg|
67
71
  # to exclude documents here that aren't wanted but would be
68
72
  # included by the inclusion patterns.
69
73
  # cfg.crawler.exclude_documents = nil
70
-
74
+
71
75
  # number of document fetching threads to use. Should be raised only if
72
76
  # your CPU has idle time when indexing.
73
77
  # cfg.crawler.num_threads = 2
@@ -85,14 +89,14 @@ RDig.configuration do |cfg|
85
89
  # limit the crawling depth. Default: nil (unlimited)
86
90
  # Set to 0 to only index the start_urls.
87
91
  # cfg.crawler.max_depth = nil
88
-
92
+
89
93
  # default index document to be appended to URIs ending with a trailing '/'
90
94
  # cfg.crawler.normalize_uri.index_document = nil
91
95
  # strip trailing '/' from URIs to avoid double indexing of pages referred by '
92
96
  # Ignored if index_document is set.
93
97
  # Not necessary when the server issues proper etags since the default etag filter will discard these duplicates.
94
98
  # cfg.crawler.normalize_uri.remove_trailing_slash = nil
95
-
99
+
96
100
  # http proxy configuration
97
101
  # proxy url
98
102
  # cfg.crawler.http_proxy = nil
@@ -107,6 +111,18 @@ RDig.configuration do |cfg|
107
111
 
108
112
  # indexer options
109
113
 
114
+ # RDig defaults to AND queries:
115
+ # "foo bar" -> foo AND bar
116
+ # "foo OR bar" -> foo OR bar
117
+ # "foo AND bar" -> foo AND bar
118
+ # cfg.index.or_default = false
119
+
120
+ # set to true to get the default ferret behaviour:
121
+ # "foo bar" -> foo OR bar
122
+ # "foo OR bar" -> foo OR bar
123
+ # "foo AND bar" -> foo AND bar
124
+ # cfg.index.or_default = true
125
+
110
126
  # create a new index on each run. Will append to the index if false. Use when
111
127
  # building a single index from multiple runs, e.g. one across a website and the
112
128
  # other a tree in a local file system
@@ -121,5 +137,5 @@ RDig.configuration do |cfg|
121
137
  # uri.scheme = 'http'
122
138
  # uri.host = 'www.mydomain.com'
123
139
  # }
124
-
140
+
125
141
  end
data/lib/rdig/search.rb CHANGED
@@ -9,7 +9,7 @@ module RDig
9
9
  # the query parser used to parse query strings
10
10
  attr_reader :query_parser
11
11
 
12
- # takes the ferret section of the rdig configuration as a parameter.
12
+ # takes the index section of the rdig configuration as a parameter.
13
13
  def initialize(settings)
14
14
  @ferret_config = settings
15
15
  @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
@@ -37,38 +37,64 @@ module RDig
37
37
  0
38
38
  end
39
39
 
40
- # run a search.
40
+ # run a search.
41
+ #
41
42
  # +query+ usually will be a user-entered string. See the Ferret query
42
43
  # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
43
44
  # for more information on queries.
44
45
  # A Ferret::Search::Query instance may be given, too.
45
- #
46
+ #
46
47
  # Some of the more often used options are:
47
48
  # offset:: first document in result list to retrieve (0-based). The default is 0.
48
49
  # limit:: number of documents to retrieve. The default is 10.
50
+ # highlight:: hash to configure excerpt highlighting, e.g.
51
+ # :highlight => { :pre_tag => '<b>',
52
+ # :post_tag => '</b>',
53
+ # :ellipsis => '&hellip;',
54
+ # :excerpt_length => 50,
55
+ # :num_excerpts => 3 }
56
+ # You may just set :highlight => true to go with the defaults, or use a hash to
57
+ # override those default values.
58
+ #
49
59
  # Please see the Ferret::Search::Searcher API for more options.
50
60
  def search(query, options={})
51
61
  result = {}
52
- query = query_parser.parse(query) if query.is_a?(String)
62
+ query = process_query query
53
63
  RDig.logger.info "Query: #{query}"
54
64
  results = []
55
65
  searcher = ferret_searcher
56
66
  maximum_score = get_maximum_score query, options
57
67
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
58
68
  doc = searcher[doc_id]
59
- results << { :score => score,
60
- :title => doc[:title],
61
- :url => doc[:url],
69
+ results << { :score => score,
70
+ :title => doc[:title],
71
+ :url => doc[:url],
62
72
  :extract => build_extract(doc[:data]),
63
- :relative_score => (score / maximum_score)
73
+ :relative_score => (score / maximum_score),
74
+ :doc_id => doc_id
64
75
  }
65
76
  end
77
+ if highlight_opts = options[:highlight]
78
+ highlight_opts = { :pre_tag => '<b>',
79
+ :post_tag => '</b>',
80
+ :ellipsis => '&hellip;',
81
+ :excerpt_length => 50,
82
+ :num_excerpts => 3 }.merge(Hash === highlight_opts ? highlight_opts : {})
83
+ results.each do |r|
84
+ r[:extract] = searcher.highlight(query, r[:doc_id], :data, highlight_opts)
85
+ end
86
+ end
66
87
  result[:list] = results
67
88
  result
68
89
  end
69
90
 
91
+ def process_query(query)
92
+ query = query_parser.parse(query) if query.is_a?(String)
93
+ return query
94
+ end
95
+
70
96
  def build_extract(data)
71
- (data && data.length > 200) ? data[0..200] : data
97
+ (data && data.length > 200) ? data[0..200] : data
72
98
  end
73
99
 
74
100
  end
data/lib/rdig.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  #--
4
4
  # Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
5
- #
5
+ #
6
6
  # Permission is hereby granted, free of charge, to any person obtaining
7
7
  # a copy of this software and associated documentation files (the
8
8
  # "Software"), to deal in the Software without restriction, including
@@ -10,10 +10,10 @@
10
10
  # distribute, sublicense, and/or sell copies of the Software, and to
11
11
  # permit persons to whom the Software is furnished to do so, subject to
12
12
  # the following conditions:
13
- #
13
+ #
14
14
  # The above copyright notice and this permission notice shall be
15
15
  # included in all copies or substantial portions of the Software.
16
- #
16
+ #
17
17
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
18
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
19
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.8'
27
+ RDIGVERSION = '0.3.10'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -143,7 +143,7 @@ module RDig
143
143
  :create => true,
144
144
  :handle_parse_errors => true,
145
145
  :analyzer => Ferret::Analysis::StandardAnalyzer.new,
146
- :occur_default => :must,
146
+ :or_default => false,
147
147
  :default_field => '*'
148
148
  )
149
149
  )
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 10
9
- version: 0.3.10
8
+ - 11
9
+ version: 0.3.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jens Kraemer