rdig 0.3.10 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/doc/examples/config.rb +25 -9
- data/lib/rdig/search.rb +35 -9
- data/lib/rdig.rb +5 -5
- metadata +2 -2
data/doc/examples/config.rb
CHANGED
@@ -8,7 +8,7 @@ RDig.configuration do |cfg|
|
|
8
8
|
|
9
9
|
# log level, set to :debug, :info, :warn or :error
|
10
10
|
cfg.log_level = :info
|
11
|
-
|
11
|
+
|
12
12
|
# provide one or more URLs for the crawler to start from
|
13
13
|
cfg.crawler.start_urls = [ 'http://www.example.com/' ]
|
14
14
|
|
@@ -26,15 +26,19 @@ RDig.configuration do |cfg|
|
|
26
26
|
# caution, existing contents of this directory will be deleted!
|
27
27
|
cfg.index.path = '/path/to/index'
|
28
28
|
|
29
|
+
# Set the analyzer to use.
|
30
|
+
# See the Ferret::Analysis package rdoc for more info about analyzers.
|
31
|
+
# cfg.index.analyzer = Ferret::Analysis::StandardAnalyzer.new
|
32
|
+
|
29
33
|
##################################################################
|
30
34
|
# options you might want to set, the given values are the defaults
|
31
35
|
|
32
36
|
# set to true to get stack traces on errors
|
33
37
|
# cfg.verbose = false
|
34
|
-
|
38
|
+
|
35
39
|
# content extraction options
|
36
40
|
cfg.content_extraction = OpenStruct.new(
|
37
|
-
|
41
|
+
|
38
42
|
# HPRICOT configuration
|
39
43
|
# hpricot is the html parsing lib used by RDig. See
|
40
44
|
# http://code.whytheluckystiff.net/hpricot for usage information.
|
@@ -52,11 +56,11 @@ RDig.configuration do |cfg|
|
|
52
56
|
)
|
53
57
|
|
54
58
|
# crawler options
|
55
|
-
|
59
|
+
|
56
60
|
# Notice: for file system crawling the include/exclude_document patterns are
|
57
61
|
# applied to the full path of _files_ only (like /home/bob/test.pdf),
|
58
62
|
# for http to full URIs (like http://example.com/index.html).
|
59
|
-
|
63
|
+
|
60
64
|
# nil (include all documents) or an array of Regexps
|
61
65
|
# matching the URLs you want to index.
|
62
66
|
# cfg.crawler.include_documents = nil
|
@@ -67,7 +71,7 @@ RDig.configuration do |cfg|
|
|
67
71
|
# to exclude documents here that aren't wanted but would be
|
68
72
|
# included by the inclusion patterns.
|
69
73
|
# cfg.crawler.exclude_documents = nil
|
70
|
-
|
74
|
+
|
71
75
|
# number of document fetching threads to use. Should be raised only if
|
72
76
|
# your CPU has idle time when indexing.
|
73
77
|
# cfg.crawler.num_threads = 2
|
@@ -85,14 +89,14 @@ RDig.configuration do |cfg|
|
|
85
89
|
# limit the crawling depth. Default: nil (unlimited)
|
86
90
|
# Set to 0 to only index the start_urls.
|
87
91
|
# cfg.crawler.max_depth = nil
|
88
|
-
|
92
|
+
|
89
93
|
# default index document to be appended to URIs ending with a trailing '/'
|
90
94
|
# cfg.crawler.normalize_uri.index_document = nil
|
91
95
|
# strip trailing '/' from URIs to avoid double indexing of pages referred by '
|
92
96
|
# Ignored if index_document is set.
|
93
97
|
# Not necessary when the server issues proper etags since the default etag filter will eliminate these duplicates.
|
94
98
|
# cfg.crawler.normalize_uri.remove_trailing_slash = nil
|
95
|
-
|
99
|
+
|
96
100
|
# http proxy configuration
|
97
101
|
# proxy url
|
98
102
|
# cfg.crawler.http_proxy = nil
|
@@ -107,6 +111,18 @@ RDig.configuration do |cfg|
|
|
107
111
|
|
108
112
|
# indexer options
|
109
113
|
|
114
|
+
# RDig defaults to AND queries:
|
115
|
+
# "foo bar" -> foo AND bar
|
116
|
+
# "foo OR bar" -> foo OR bar
|
117
|
+
# "foo AND bar" -> foo AND bar
|
118
|
+
# cfg.index.or_default = false
|
119
|
+
|
120
|
+
# set to true to get the default ferret behaviour:
|
121
|
+
# "foo bar" -> foo OR bar
|
122
|
+
# "foo OR bar" -> foo OR bar
|
123
|
+
# "foo AND bar" -> foo AND bar
|
124
|
+
# cfg.index.or_default = true
|
125
|
+
|
110
126
|
# create a new index on each run. Will append to the index if false. Use when
|
111
127
|
# building a single index from multiple runs, e.g. one across a website and the
|
112
128
|
# other a tree in a local file system
|
@@ -121,5 +137,5 @@ RDig.configuration do |cfg|
|
|
121
137
|
# uri.scheme = 'http'
|
122
138
|
# uri.host = 'www.mydomain.com'
|
123
139
|
# }
|
124
|
-
|
140
|
+
|
125
141
|
end
|
data/lib/rdig/search.rb
CHANGED
@@ -9,7 +9,7 @@ module RDig
|
|
9
9
|
# the query parser used to parse query strings
|
10
10
|
attr_reader :query_parser
|
11
11
|
|
12
|
-
# takes the
|
12
|
+
# takes the index section of the rdig configuration as a parameter.
|
13
13
|
def initialize(settings)
|
14
14
|
@ferret_config = settings
|
15
15
|
@query_parser = Ferret::QueryParser.new(settings.marshal_dump)
|
@@ -37,38 +37,64 @@ module RDig
|
|
37
37
|
0
|
38
38
|
end
|
39
39
|
|
40
|
-
# run a search.
|
40
|
+
# run a search.
|
41
|
+
#
|
41
42
|
# +query+ usually will be a user-entered string. See the Ferret query
|
42
43
|
# language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
|
43
44
|
# for more information on queries.
|
44
45
|
# A Ferret::Search::Query instance may be given, too.
|
45
|
-
#
|
46
|
+
#
|
46
47
|
# Some of the more often used options are:
|
47
48
|
# offset:: first document in result list to retrieve (0-based). The default is 0.
|
48
49
|
# limit:: number of documents to retrieve. The default is 10.
|
50
|
+
# highlight:: hash to configure excerpt highlighting, e.g.
|
51
|
+
# :highlight => { :pre_tag => '<b>',
|
52
|
+
# :post_tag => '</b>',
|
53
|
+
# :ellipsis => '…',
|
54
|
+
# :excerpt_length => 50,
|
55
|
+
# :num_excerpts => 3 }
|
56
|
+
# You may just set :highlight => true to go with the defaults, or use a hash to
|
57
|
+
# override those default values.
|
58
|
+
#
|
49
59
|
# Please see the Ferret::Search::Searcher API for more options.
|
50
60
|
def search(query, options={})
|
51
61
|
result = {}
|
52
|
-
query =
|
62
|
+
query = process_query query
|
53
63
|
RDig.logger.info "Query: #{query}"
|
54
64
|
results = []
|
55
65
|
searcher = ferret_searcher
|
56
66
|
maximum_score = get_maximum_score query, options
|
57
67
|
result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
|
58
68
|
doc = searcher[doc_id]
|
59
|
-
results << { :score => score,
|
60
|
-
:title => doc[:title],
|
61
|
-
:url => doc[:url],
|
69
|
+
results << { :score => score,
|
70
|
+
:title => doc[:title],
|
71
|
+
:url => doc[:url],
|
62
72
|
:extract => build_extract(doc[:data]),
|
63
|
-
:relative_score => (score / maximum_score)
|
73
|
+
:relative_score => (score / maximum_score),
|
74
|
+
:doc_id => doc_id
|
64
75
|
}
|
65
76
|
end
|
77
|
+
if highlight_opts = options[:highlight]
|
78
|
+
highlight_opts = { :pre_tag => '<b>',
|
79
|
+
:post_tag => '</b>',
|
80
|
+
:ellipsis => '…',
|
81
|
+
:excerpt_length => 50,
|
82
|
+
:num_excerpts => 3 }.merge(Hash === highlight_opts ? highlight_opts : {})
|
83
|
+
results.each do |r|
|
84
|
+
r[:extract] = searcher.highlight(query, r[:doc_id], :data, highlight_opts)
|
85
|
+
end
|
86
|
+
end
|
66
87
|
result[:list] = results
|
67
88
|
result
|
68
89
|
end
|
69
90
|
|
91
|
+
def process_query(query)
|
92
|
+
query = query_parser.parse(query) if query.is_a?(String)
|
93
|
+
return query
|
94
|
+
end
|
95
|
+
|
70
96
|
def build_extract(data)
|
71
|
-
(data && data.length > 200) ? data[0..200] : data
|
97
|
+
(data && data.length > 200) ? data[0..200] : data
|
72
98
|
end
|
73
99
|
|
74
100
|
end
|
data/lib/rdig.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
#--
|
4
4
|
# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
|
5
|
-
#
|
5
|
+
#
|
6
6
|
# Permission is hereby granted, free of charge, to any person obtaining
|
7
7
|
# a copy of this software and associated documentation files (the
|
8
8
|
# "Software"), to deal in the Software without restriction, including
|
@@ -10,10 +10,10 @@
|
|
10
10
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
11
|
# permit persons to whom the Software is furnished to do so, subject to
|
12
12
|
# the following conditions:
|
13
|
-
#
|
13
|
+
#
|
14
14
|
# The above copyright notice and this permission notice shall be
|
15
15
|
# included in all copies or substantial portions of the Software.
|
16
|
-
#
|
16
|
+
#
|
17
17
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
18
18
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
19
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.
|
27
|
+
RDIGVERSION = '0.3.10'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -143,7 +143,7 @@ module RDig
|
|
143
143
|
:create => true,
|
144
144
|
:handle_parse_errors => true,
|
145
145
|
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
146
|
-
:
|
146
|
+
:or_default => false,
|
147
147
|
:default_field => '*'
|
148
148
|
)
|
149
149
|
)
|