rdig 0.3.10 → 0.3.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,7 +8,7 @@ RDig.configuration do |cfg|
8
8
 
9
9
  # log level, set to :debug, :info, :warn or :error
10
10
  cfg.log_level = :info
11
-
11
+
12
12
  # provide one or more URLs for the crawler to start from
13
13
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]
14
14
 
@@ -26,15 +26,19 @@ RDig.configuration do |cfg|
26
26
  # caution, existing contents of this directory will be deleted!
27
27
  cfg.index.path = '/path/to/index'
28
28
 
29
+ # Set the analyzer to use.
30
+ # See the Ferret::Analysis package rdoc for more info about analyzers.
31
+ # cfg.index.analyzer = Ferret::Analysis::StandardAnalyzer.new
32
+
29
33
  ##################################################################
30
34
  # options you might want to set, the given values are the defaults
31
35
 
32
36
  # set to true to get stack traces on errors
33
37
  # cfg.verbose = false
34
-
38
+
35
39
  # content extraction options
36
40
  cfg.content_extraction = OpenStruct.new(
37
-
41
+
38
42
  # HPRICOT configuration
39
43
  # hpricot is the html parsing lib used by RDig. See
40
44
  # http://code.whytheluckystiff.net/hpricot for usage information.
@@ -52,11 +56,11 @@ RDig.configuration do |cfg|
52
56
  )
53
57
 
54
58
  # crawler options
55
-
59
+
56
60
  # Notice: for file system crawling the include/exclude_document patterns are
57
61
  # applied to the full path of _files_ only (like /home/bob/test.pdf),
58
62
  # for http to full URIs (like http://example.com/index.html).
59
-
63
+
60
64
  # nil (include all documents) or an array of Regexps
61
65
  # matching the URLs you want to index.
62
66
  # cfg.crawler.include_documents = nil
@@ -67,7 +71,7 @@ RDig.configuration do |cfg|
67
71
  # to exclude documents here that aren't wanted but would be
68
72
  # included by the inclusion patterns.
69
73
  # cfg.crawler.exclude_documents = nil
70
-
74
+
71
75
  # number of document fetching threads to use. Should be raised only if
72
76
  # your CPU has idle time when indexing.
73
77
  # cfg.crawler.num_threads = 2
@@ -85,14 +89,14 @@ RDig.configuration do |cfg|
85
89
  # limit the crawling depth. Default: nil (unlimited)
86
90
  # Set to 0 to only index the start_urls.
87
91
  # cfg.crawler.max_depth = nil
88
-
92
+
89
93
  # default index document to be appended to URIs ending with a trailing '/'
90
94
  # cfg.crawler.normalize_uri.index_document = nil
91
95
  # strip trailing '/' from URIs to avoid double indexing of pages referred to both with and without a trailing '/'.
92
96
  # Ignored if index_document is set.
93
97
  # Not necessary when the server issues proper etags, since the default etag filter will eliminate these duplicates.
94
98
  # cfg.crawler.normalize_uri.remove_trailing_slash = nil
95
-
99
+
96
100
  # http proxy configuration
97
101
  # proxy url
98
102
  # cfg.crawler.http_proxy = nil
@@ -107,6 +111,18 @@ RDig.configuration do |cfg|
107
111
 
108
112
  # indexer options
109
113
 
114
+ # RDig defaults to AND queries:
115
+ # "foo bar" -> foo AND bar
116
+ # "foo OR bar" -> foo OR bar
117
+ # "foo AND bar" -> foo AND bar
118
+ # cfg.index.or_default = false
119
+
120
+ # set to true to get the default ferret behaviour:
121
+ # "foo bar" -> foo OR bar
122
+ # "foo OR bar" -> foo OR bar
123
+ # "foo AND bar" -> foo AND bar
124
+ # cfg.index.or_default = true
125
+
110
126
  # create a new index on each run. Will append to the index if false. Use when
111
127
  # building a single index from multiple runs, e.g. one across a website and the
112
128
  # other a tree in a local file system
@@ -121,5 +137,5 @@ RDig.configuration do |cfg|
121
137
  # uri.scheme = 'http'
122
138
  # uri.host = 'www.mydomain.com'
123
139
  # }
124
-
140
+
125
141
  end
data/lib/rdig/search.rb CHANGED
@@ -9,7 +9,7 @@ module RDig
9
9
  # the query parser used to parse query strings
10
10
  attr_reader :query_parser
11
11
 
12
- # takes the ferret section of the rdig configuration as a parameter.
12
+ # takes the index section of the rdig configuration as a parameter.
13
13
  def initialize(settings)
14
14
  @ferret_config = settings
15
15
  @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
@@ -37,38 +37,64 @@ module RDig
37
37
  0
38
38
  end
39
39
 
40
- # run a search.
40
+ # run a search.
41
+ #
41
42
  # +query+ usually will be a user-entered string. See the Ferret query
42
43
  # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
43
44
  # for more information on queries.
44
45
  # A Ferret::Search::Query instance may be given, too.
45
- #
46
+ #
46
47
  # Some of the more often used options are:
47
48
  # offset:: first document in result list to retrieve (0-based). The default is 0.
48
49
  # limit:: number of documents to retrieve. The default is 10.
50
+ # highlight:: hash to configure excerpt highlighting, e.g.
51
+ # :highlight => { :pre_tag => '<b>',
52
+ # :post_tag => '</b>',
53
+ # :ellipsis => '&hellip;',
54
+ # :excerpt_length => 50,
55
+ # :num_excerpts => 3 }
56
+ # You may just set :highlight => true to go with the defaults, or use a hash to
57
+ # override those default values.
58
+ #
49
59
  # Please see the Ferret::Search::Searcher API for more options.
50
60
  def search(query, options={})
51
61
  result = {}
52
- query = query_parser.parse(query) if query.is_a?(String)
62
+ query = process_query query
53
63
  RDig.logger.info "Query: #{query}"
54
64
  results = []
55
65
  searcher = ferret_searcher
56
66
  maximum_score = get_maximum_score query, options
57
67
  result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
58
68
  doc = searcher[doc_id]
59
- results << { :score => score,
60
- :title => doc[:title],
61
- :url => doc[:url],
69
+ results << { :score => score,
70
+ :title => doc[:title],
71
+ :url => doc[:url],
62
72
  :extract => build_extract(doc[:data]),
63
- :relative_score => (score / maximum_score)
73
+ :relative_score => (score / maximum_score),
74
+ :doc_id => doc_id
64
75
  }
65
76
  end
77
+ if highlight_opts = options[:highlight]
78
+ highlight_opts = { :pre_tag => '<b>',
79
+ :post_tag => '</b>',
80
+ :ellipsis => '&hellip;',
81
+ :excerpt_length => 50,
82
+ :num_excerpts => 3 }.merge(Hash === highlight_opts ? highlight_opts : {})
83
+ results.each do |r|
84
+ r[:extract] = searcher.highlight(query, r[:doc_id], :data, highlight_opts)
85
+ end
86
+ end
66
87
  result[:list] = results
67
88
  result
68
89
  end
69
90
 
91
+ def process_query(query)
92
+ query = query_parser.parse(query) if query.is_a?(String)
93
+ return query
94
+ end
95
+
70
96
  def build_extract(data)
71
- (data && data.length > 200) ? data[0..200] : data
97
+ (data && data.length > 200) ? data[0..200] : data
72
98
  end
73
99
 
74
100
  end
data/lib/rdig.rb CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  #--
4
4
  # Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
5
- #
5
+ #
6
6
  # Permission is hereby granted, free of charge, to any person obtaining
7
7
  # a copy of this software and associated documentation files (the
8
8
  # "Software"), to deal in the Software without restriction, including
@@ -10,10 +10,10 @@
10
10
  # distribute, sublicense, and/or sell copies of the Software, and to
11
11
  # permit persons to whom the Software is furnished to do so, subject to
12
12
  # the following conditions:
13
- #
13
+ #
14
14
  # The above copyright notice and this permission notice shall be
15
15
  # included in all copies or substantial portions of the Software.
16
- #
16
+ #
17
17
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
18
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
19
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.8'
27
+ RDIGVERSION = '0.3.10'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -143,7 +143,7 @@ module RDig
143
143
  :create => true,
144
144
  :handle_parse_errors => true,
145
145
  :analyzer => Ferret::Analysis::StandardAnalyzer.new,
146
- :occur_default => :must,
146
+ :or_default => false,
147
147
  :default_field => '*'
148
148
  )
149
149
  )
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 3
8
- - 10
9
- version: 0.3.10
8
+ - 11
9
+ version: 0.3.11
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jens Kraemer