rdig 0.3.10 → 0.3.11
Sign up to get free protection for your applications and to get access to all the features.
- data/doc/examples/config.rb +25 -9
- data/lib/rdig/search.rb +35 -9
- data/lib/rdig.rb +5 -5
- metadata +2 -2
data/doc/examples/config.rb
CHANGED
@@ -8,7 +8,7 @@ RDig.configuration do |cfg|
|
|
8
8
|
|
9
9
|
# log level, set to :debug, :info, :warn or :error
|
10
10
|
cfg.log_level = :info
|
11
|
-
|
11
|
+
|
12
12
|
# provide one or more URLs for the crawler to start from
|
13
13
|
cfg.crawler.start_urls = [ 'http://www.example.com/' ]
|
14
14
|
|
@@ -26,15 +26,19 @@ RDig.configuration do |cfg|
|
|
26
26
|
# caution, existing contents of this directory will be deleted!
|
27
27
|
cfg.index.path = '/path/to/index'
|
28
28
|
|
29
|
+
# Set the analyzer to use.
|
30
|
+
# See the Ferret::Analysis package rdoc for more info about analyzers.
|
31
|
+
# cfg.index.analyzer = Ferret::Analysis::StandardAnalyzer.new
|
32
|
+
|
29
33
|
##################################################################
|
30
34
|
# options you might want to set, the given values are the defaults
|
31
35
|
|
32
36
|
# set to true to get stack traces on errors
|
33
37
|
# cfg.verbose = false
|
34
|
-
|
38
|
+
|
35
39
|
# content extraction options
|
36
40
|
cfg.content_extraction = OpenStruct.new(
|
37
|
-
|
41
|
+
|
38
42
|
# HPRICOT configuration
|
39
43
|
# hpricot is the html parsing lib used by RDig. See
|
40
44
|
# http://code.whytheluckystiff.net/hpricot for usage information.
|
@@ -52,11 +56,11 @@ RDig.configuration do |cfg|
|
|
52
56
|
)
|
53
57
|
|
54
58
|
# crawler options
|
55
|
-
|
59
|
+
|
56
60
|
# Notice: for file system crawling the include/exclude_document patterns are
|
57
61
|
# applied to the full path of _files_ only (like /home/bob/test.pdf),
|
58
62
|
# for http to full URIs (like http://example.com/index.html).
|
59
|
-
|
63
|
+
|
60
64
|
# nil (include all documents) or an array of Regexps
|
61
65
|
# matching the URLs you want to index.
|
62
66
|
# cfg.crawler.include_documents = nil
|
@@ -67,7 +71,7 @@ RDig.configuration do |cfg|
|
|
67
71
|
# to exclude documents here that aren't wanted but would be
|
68
72
|
# included by the inclusion patterns.
|
69
73
|
# cfg.crawler.exclude_documents = nil
|
70
|
-
|
74
|
+
|
71
75
|
# number of document fetching threads to use. Should be raised only if
|
72
76
|
# your CPU has idle time when indexing.
|
73
77
|
# cfg.crawler.num_threads = 2
|
@@ -85,14 +89,14 @@ RDig.configuration do |cfg|
|
|
85
89
|
# limit the crawling depth. Default: nil (unlimited)
|
86
90
|
# Set to 0 to only index the start_urls.
|
87
91
|
# cfg.crawler.max_depth = nil
|
88
|
-
|
92
|
+
|
89
93
|
# default index document to be appended to URIs ending with a trailing '/'
|
90
94
|
# cfg.crawler.normalize_uri.index_document = nil
|
91
95
|
# strip trailing '/' from URIs to avoid double indexing of pages referred by '
|
92
96
|
# Ignored if index_document is set.
|
93
97
|
# Not necessary when the server issues proper etags since the default etag filter will remove these duplicates.
|
94
98
|
# cfg.crawler.normalize_uri.remove_trailing_slash = nil
|
95
|
-
|
99
|
+
|
96
100
|
# http proxy configuration
|
97
101
|
# proxy url
|
98
102
|
# cfg.crawler.http_proxy = nil
|
@@ -107,6 +111,18 @@ RDig.configuration do |cfg|
|
|
107
111
|
|
108
112
|
# indexer options
|
109
113
|
|
114
|
+
# RDig defaults to AND queries:
|
115
|
+
# "foo bar" -> foo AND bar
|
116
|
+
# "foo OR bar" -> foo OR bar
|
117
|
+
# "foo AND bar" -> foo AND bar
|
118
|
+
# cfg.index.or_default = false
|
119
|
+
|
120
|
+
# set to true to get the default ferret behaviour:
|
121
|
+
# "foo bar" -> foo OR bar
|
122
|
+
# "foo OR bar" -> foo OR bar
|
123
|
+
# "foo AND bar" -> foo AND bar
|
124
|
+
# cfg.index.or_default = true
|
125
|
+
|
110
126
|
# create a new index on each run. Will append to the index if false. Use when
|
111
127
|
# building a single index from multiple runs, e.g. one across a website and the
|
112
128
|
# other a tree in a local file system
|
@@ -121,5 +137,5 @@ RDig.configuration do |cfg|
|
|
121
137
|
# uri.scheme = 'http'
|
122
138
|
# uri.host = 'www.mydomain.com'
|
123
139
|
# }
|
124
|
-
|
140
|
+
|
125
141
|
end
|
data/lib/rdig/search.rb
CHANGED
@@ -9,7 +9,7 @@ module RDig
|
|
9
9
|
# the query parser used to parse query strings
|
10
10
|
attr_reader :query_parser
|
11
11
|
|
12
|
-
# takes the
|
12
|
+
# takes the index section of the rdig configuration as a parameter.
|
13
13
|
def initialize(settings)
|
14
14
|
@ferret_config = settings
|
15
15
|
@query_parser = Ferret::QueryParser.new(settings.marshal_dump)
|
@@ -37,38 +37,64 @@ module RDig
|
|
37
37
|
0
|
38
38
|
end
|
39
39
|
|
40
|
-
# run a search.
|
40
|
+
# run a search.
|
41
|
+
#
|
41
42
|
# +query+ usually will be a user-entered string. See the Ferret query
|
42
43
|
# language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
|
43
44
|
# for more information on queries.
|
44
45
|
# A Ferret::Search::Query instance may be given, too.
|
45
|
-
#
|
46
|
+
#
|
46
47
|
# Some of the more often used options are:
|
47
48
|
# offset:: first document in result list to retrieve (0-based). The default is 0.
|
48
49
|
# limit:: number of documents to retrieve. The default is 10.
|
50
|
+
# highlight:: hash to configure excerpt highlighting, e.g.
|
51
|
+
# :highlight => { :pre_tag => '<b>',
|
52
|
+
# :post_tag => '</b>',
|
53
|
+
# :ellipsis => '…',
|
54
|
+
# :excerpt_length => 50,
|
55
|
+
# :num_excerpts => 3 }
|
56
|
+
# You may just set :highlight => true to go with the defaults, or use a hash to
|
57
|
+
# override those default values.
|
58
|
+
#
|
49
59
|
# Please see the Ferret::Search::Searcher API for more options.
|
50
60
|
def search(query, options={})
|
51
61
|
result = {}
|
52
|
-
query =
|
62
|
+
query = process_query query
|
53
63
|
RDig.logger.info "Query: #{query}"
|
54
64
|
results = []
|
55
65
|
searcher = ferret_searcher
|
56
66
|
maximum_score = get_maximum_score query, options
|
57
67
|
result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
|
58
68
|
doc = searcher[doc_id]
|
59
|
-
results << { :score => score,
|
60
|
-
:title => doc[:title],
|
61
|
-
:url => doc[:url],
|
69
|
+
results << { :score => score,
|
70
|
+
:title => doc[:title],
|
71
|
+
:url => doc[:url],
|
62
72
|
:extract => build_extract(doc[:data]),
|
63
|
-
:relative_score => (score / maximum_score)
|
73
|
+
:relative_score => (score / maximum_score),
|
74
|
+
:doc_id => doc_id
|
64
75
|
}
|
65
76
|
end
|
77
|
+
if highlight_opts = options[:highlight]
|
78
|
+
highlight_opts = { :pre_tag => '<b>',
|
79
|
+
:post_tag => '</b>',
|
80
|
+
:ellipsis => '…',
|
81
|
+
:excerpt_length => 50,
|
82
|
+
:num_excerpts => 3 }.merge(Hash === highlight_opts ? highlight_opts : {})
|
83
|
+
results.each do |r|
|
84
|
+
r[:extract] = searcher.highlight(query, r[:doc_id], :data, highlight_opts)
|
85
|
+
end
|
86
|
+
end
|
66
87
|
result[:list] = results
|
67
88
|
result
|
68
89
|
end
|
69
90
|
|
91
|
+
def process_query(query)
|
92
|
+
query = query_parser.parse(query) if query.is_a?(String)
|
93
|
+
return query
|
94
|
+
end
|
95
|
+
|
70
96
|
def build_extract(data)
|
71
|
-
(data && data.length > 200) ? data[0..200] : data
|
97
|
+
(data && data.length > 200) ? data[0..200] : data
|
72
98
|
end
|
73
99
|
|
74
100
|
end
|
data/lib/rdig.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
#--
|
4
4
|
# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
|
5
|
-
#
|
5
|
+
#
|
6
6
|
# Permission is hereby granted, free of charge, to any person obtaining
|
7
7
|
# a copy of this software and associated documentation files (the
|
8
8
|
# "Software"), to deal in the Software without restriction, including
|
@@ -10,10 +10,10 @@
|
|
10
10
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
11
|
# permit persons to whom the Software is furnished to do so, subject to
|
12
12
|
# the following conditions:
|
13
|
-
#
|
13
|
+
#
|
14
14
|
# The above copyright notice and this permission notice shall be
|
15
15
|
# included in all copies or substantial portions of the Software.
|
16
|
-
#
|
16
|
+
#
|
17
17
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
18
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
19
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.
|
27
|
+
RDIGVERSION = '0.3.10'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -143,7 +143,7 @@ module RDig
|
|
143
143
|
:create => true,
|
144
144
|
:handle_parse_errors => true,
|
145
145
|
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
146
|
-
:
|
146
|
+
:or_default => false,
|
147
147
|
:default_field => '*'
|
148
148
|
)
|
149
149
|
)
|