rdig 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,24 @@
1
module RDig
  module Search

    # Beginning of a port of the query term highlighter from Lucene contrib.
    #
    # NOTE: the actual fragment extraction (get_best_text_fragments) is not
    # implemented yet, so best_fragments currently always returns an empty
    # Array.
    class Highlighter
      # Picks up the analyzer from the ferret section of the RDig
      # configuration (RDig.config.ferret.analyzer).
      def initialize
        @analyzer = RDig.config.ferret.analyzer
      end

      # Returns an Array with the string form (+to_s+) of every fragment
      # whose score is greater than zero.
      #
      # scorer::        currently unused by this implementation
      # text::          the text to extract fragments from; it is tokenized
      #                 as the 'body' field
      # max_fragments:: upper bound handed on to get_best_text_fragments
      #                 (default 1)
      def best_fragments(scorer, text, max_fragments = 1)
        token_stream = @analyzer.token_stream('body', text)
        frag_texts = []
        get_best_text_fragments(token_stream, text, max_fragments).each { |frag|
          frag_texts << frag.to_s if frag && frag.score > 0
        }
        frag_texts
      end

      # Not ported yet. Returns an empty Array rather than nil so that
      # best_fragments can iterate over the result without raising.
      def get_best_text_fragments(token_stream, text, max_fragments)
        []
      end
    end

  end
end
@@ -0,0 +1,22 @@
1
module RDig

  # Mixin providing a minimal HTTP GET helper.
  module HttpClient
    # Fetches +uri+ with a GET request and returns whatever Net::HTTP#get
    # returns for it.
    #
    # uri::        object responding to host, port, path and query
    # user_agent:: value sent in the User-Agent header
    #
    # Any StandardError raised while connecting or fetching is printed with
    # +puts+ and nil is returned (best effort, the caller has to cope with
    # nil).
    #
    # NOTE(review): the connection is opened without any SSL setup, so this
    # presumably only works for plain http URIs - confirm against callers.
    def do_get(uri, user_agent='RDig crawler')
      # Set up the appropriate http headers
      headers = { "User-Agent" => user_agent }

      # A URI such as http://host has an empty path; an HTTP request line
      # needs at least "/".
      request_path = uri.path.to_s
      request_path = '/' if request_path.empty?
      request_path += "?#{uri.query}" if uri.query

      Net::HTTP.start(uri.host, uri.port || 80) { |http|
        http.get(request_path, headers)
      }
    rescue => error
      puts error
      nil
    end
  end

end
22
+
data/lib/rdig/index.rb ADDED
@@ -0,0 +1,39 @@
1
module RDig
  module Index

    # Used by the crawler to build the ferret index.
    #
    # Writes to the underlying IndexWriter are wrapped in the monitor
    # provided by MonitorMixin.
    class Indexer
      include MonitorMixin, Ferret::Index, Ferret::Document

      # settings:: object responding to +path+, +create+ and +analyzer+;
      #            these are handed to the Ferret IndexWriter.
      def initialize(settings)
        @index_writer = IndexWriter.new(settings.path,
                                        :create   => settings.create,
                                        :analyzer => settings.analyzer)
        # The explicit empty parens matter: MonitorMixin is only initialized
        # when super is called without forwarding +settings+.
        super()
      end

      # Builds a Ferret document with the stored fields "url" (untokenized),
      # "title" and "data" (both tokenized) from +document+ and appends it to
      # the index. Also available as <<.
      def add_to_index(document)
        puts "add to index: #{document.uri.to_s}"
        doc = Ferret::Document::Document.new
        field_specs = [
          ["url",   document.url,   Field::Index::UNTOKENIZED],
          ["title", document.title, Field::Index::TOKENIZED],
          ["data",  document.body,  Field::Index::TOKENIZED]
        ]
        field_specs.each do |name, value, index_mode|
          doc << Field.new(name, value, Field::Store::YES, index_mode)
        end
        synchronize { @index_writer << doc }
      end
      alias :<< :add_to_index

      # Optimizes and closes the index writer, then drops the reference to it.
      def close
        @index_writer.optimize
        @index_writer.close
        @index_writer = nil
      end
    end

  end
end
@@ -0,0 +1,77 @@
1
module RDig
  module Search

    # This class is used to search the index.
    # Call RDig::searcher to retrieve an instance ready for use.
    class Searcher
      include Ferret::Search

      # the query parser used to parse query strings
      attr_reader :query_parser

      # Takes the ferret section of the rdig configuration as a parameter.
      # The index searcher is opened right away.
      def initialize(settings)
        @ferret_config = settings
        @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
        ferret_searcher
      end

      # Returns the Ferret::Search::IndexSearcher instance used internally.
      #
      # A searcher whose reader is no longer the latest one is closed and
      # replaced. Whenever a new searcher is opened, the query parser's
      # fields are refreshed from the reader's field names.
      def ferret_searcher
        if @ferret_searcher && !@ferret_searcher.reader.latest?
          # reopen searcher
          @ferret_searcher.close
          @ferret_searcher = nil
        end
        return @ferret_searcher if @ferret_searcher

        @ferret_searcher = IndexSearcher.new(@ferret_config.path)
        @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
        @ferret_searcher
      end

      # Run a search.
      # +query+ usually will be a user-entered string. See the Ferret query
      # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
      # for more information on queries.
      # A Ferret::Search::Query instance may be given, too.
      #
      # Options are:
      # first_doc:: first document in result list to retrieve (0-based). The default is 0.
      # num_docs::  number of documents to retrieve. The default is 10.
      #
      # Returns a Hash with the keys :hitcount (total number of hits) and
      # :list (Array of Hashes with :score, :title, :url and :extract).
      def search(query, options={})
        query = query_parser.parse(query) if query.is_a?(String)
        puts "Query: #{query}"
        hits = ferret_searcher.search(query, options)
        result = { :hitcount => hits.total_hits }
        entries = []
        hits.each do |doc_id, score|
          doc = ferret_searcher.reader.get_document(doc_id)
          entries << { :score   => score,
                       :title   => doc['title'],
                       :url     => doc['url'],
                       :extract => build_extract(doc['data']) }
        end
        result[:list] = entries
        result
      end

      # Returns +data+ cut down to its leading characters (index 0 through
      # 200) when it is longer than 200 characters; shorter or missing data
      # is returned as is.
      def build_extract(data)
        return data unless data && data.length > 200
        data[0..200]
      end

    end

    # class SearchResult < OpenStruct
    #   def initialize(doc, score)
    #     self.score = score
    #     self.title = doc[:title]
    #     self.url = doc[:url]
    #     self.extract = doc[:content][0..200]
    #   end
    # end

  end
end
@@ -0,0 +1,171 @@
1
module RDig

  # Filters the crawler runs every document through. A filter returns the
  # document to keep it in the chain and nil to drop it.
  module UrlFilters

    # An ordered list of filters, built from a configuration Array.
    class FilterChain
      # chain_config:: Array whose entries are each one of
      #                * a Hash of filter => args pairs,
      #                * an Array of the form [filter, arg, ...],
      #                * a bare filter (Symbol or Class).
      def initialize(chain_config)
        @filters = []
        chain_config.each { |entry|
          case entry
          when Hash
            entry.each_pair { |filter, args| add(filter, args) }
          when Array
            # Destructure instead of shift-ing so the caller's configuration
            # array is left untouched and can be used to build another chain.
            filter, *args = entry
            add(filter, args)
          else
            add(entry)
          end
        }
      end

      # Add a filter and its args to the chain.
      # When args is a Symbol, it is treated as a configuration key and
      # replaced by the value RDig.config.crawler returns for it.
      #
      # A Symbol filter names a module function of UrlFilters; a Class filter
      # is instantiated (or its singleton +instance+ is used when it has one
      # and no args are given) and its +apply+ method is called. Anything
      # else is silently ignored.
      def add(filter, args=nil)
        args = RDig.config.crawler.send(args) if args.is_a? Symbol
        case filter
        when Symbol
          if args.nil?
            @filters << lambda { |document| UrlFilters.send(filter, document) }
          else
            @filters << lambda { |document| UrlFilters.send(filter, document, args) }
          end
        when Class
          filter_instance =
            if !args.nil?
              filter.new(args)
            elsif filter.respond_to?(:instance)
              filter.instance
            else
              filter.new
            end
          @filters << lambda { |document| filter_instance.apply(document) }
        end
      end

      # Runs +document+ through all filters in order. Returns nil as soon as
      # one filter rejects it, the document itself otherwise.
      def apply(document)
        @filters.all? { |filter| filter.call(document) } ? document : nil
      end
    end

    # Takes care of a list of all URLs visited during a crawl, to avoid
    # indexing pages more than once.
    # Implemented as a thread safe singleton as it has to be shared
    # between all crawler threads.
    class VisitedUrlFilter
      include MonitorMixin, Singleton

      def initialize
        @visited_urls = Set.new
        super()
      end

      # Returns document if this document's url has not been visited yet,
      # nil otherwise. The url is recorded as visited by this call.
      def apply(document)
        synchronize do
          @visited_urls.add?(document.uri.to_s) ? document : nil
        end
      end
    end


    # Base class for url inclusion / exclusion filters.
    class UrlPatternFilter
      # Takes an Array of Regexps (or a single Regexp), or nil to disable
      # the filter.
      def initialize(args=nil)
        return if args.nil?
        # cloning because unsure if regexps are thread safe...
        @patterns = if args.respond_to?(:each)
                      cloned = []
                      args.each { |pattern| cloned << pattern.clone }
                      cloned
                    else
                      [args.clone]
                    end
      end
    end

    class UrlExclusionFilter < UrlPatternFilter
      # Returns nil if any of the patterns matches the document's URL,
      # the document itself otherwise (also when the filter is disabled).
      def apply(document)
        return document unless @patterns
        url = document.uri.to_s
        @patterns.any? { |pattern| url =~ pattern } ? nil : document
      end
    end

    class UrlInclusionFilter < UrlPatternFilter
      # Returns the document if any of the patterns matches its URL (or the
      # filter is disabled), nil otherwise.
      def apply(document)
        return document unless @patterns
        url = document.uri.to_s
        @patterns.any? { |pattern| url =~ pattern } ? document : nil
      end
    end


    # Checks redirect count of the given document,
    # takes it out of the chain if number of redirections exceeds the
    # max_redirects setting.
    def UrlFilters.maximum_redirect_filter(document, max_redirects)
      document.redirections > max_redirects ? nil : document
    end

    # Expands both href="/path/xyz.html" and href="affe.html"
    # to full urls, using the document's referring uri as the base.
    # The document's uri object is modified in place.
    #
    # Returns nil for uris with a scheme not starting with http, the
    # document otherwise. Without a referring uri the document is returned
    # untouched.
    #
    # Dot segments ("../") are not resolved.
    def UrlFilters.fix_relative_uri(document)
      uri = document.uri
      return nil unless uri.scheme.nil? || uri.scheme =~ /^http/i
      ref = document.referring_uri
      return document unless ref

      # Must be decided before the host is filled in below. Only references
      # without a host inherit host, port and path from the referrer.
      relative = uri.host.nil?

      uri.scheme = ref.scheme unless uri.scheme
      if relative
        uri.host = ref.host
        uri.port = ref.port unless uri.port || ref.port == ref.default_port
      end

      # Work on a local value: the referrer's own path string must never be
      # modified or shared with the document's uri.
      ref_path = ref.path.to_s
      ref_path = '/' if ref_path.empty?
      path = uri.path.to_s

      if !relative
        # absolute url without a path, e.g. http://host
        uri.path = '/' if path.empty?
      elsif path.empty?
        # reference such as "?page=2" or "#top": same resource as the referrer
        uri.path = ref_path.dup
        uri.query = ref.query if uri.query.nil?
      elsif path !~ /^\//
        # relative path: replace the last segment of the referrer's path
        last_slash = ref_path.rindex('/')
        ref_dir = last_slash ? ref_path[0..last_slash] : ''
        uri.path = ref_dir + path
      end
      document
    end

    # Keeps the document if its host is contained in +include_hosts+,
    # drops it otherwise.
    def UrlFilters.hostname_filter(document, include_hosts)
      include_hosts.include?(document.uri.host) ? document : nil
    end

    # Removes the fragment from the document's uri and appends the
    # configured index document (RDig.config.index_document) when the path
    # ends with a slash. Always returns the document.
    def UrlFilters.normalize_uri(document)
      uri = document.uri
      uri.fragment = nil
      # uri.query = nil
      index_document = RDig.config.index_document
      if index_document && uri.path =~ /\/$/
        # assign a new string instead of appending in place, the path string
        # may be shared with other objects
        uri.path = uri.path + index_document
      end
      document
    end

  end
end
data/rakefile ADDED
@@ -0,0 +1,325 @@
1
+ # rakefile for RDig.
2
+ # large parts borrowed from rake's Rakefile
3
+
4
+ begin
5
+ require 'rubygems'
6
+ require 'rake/gempackagetask'
7
+ rescue Exception
8
+ nil
9
+ end
10
+ require 'rake'
11
+ require 'rake/testtask'
12
+ require 'rake/rdoctask'
13
+ require 'rake/packagetask'
14
+ require 'rake/contrib/rubyforgepublisher'
15
+
16
# Writes +msg+ (an empty line by default) to standard error, so that
# progress messages of the release tasks do not mix with regular output.
def announce(msg='')
  STDERR.puts(msg)
end
19
+
20
+
21
PKG_NAME = 'rdig'

# Determine the current version of the software by asking the rdig
# executable; fall back to 0.0.0 when its output cannot be parsed.
CURRENT_VERSION =
  if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
    $1
  else
    "0.0.0"
  end

# The version to package: REL from the environment wins over the
# current version.
PKG_VERSION = ENV['REL'] || CURRENT_VERSION

SRC_RB = FileList['lib/**/*.rb']

PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

RELEASE_NAME = "REL #{PKG_VERSION}"

RUBY_FORGE_PROJECT = "rdig"
RUBY_FORGE_USER = "jkraemer"

# Everything that goes into the packages, minus subversion metadata,
# backup files and vim swap files.
PKG_FILES = FileList[
  "bin/**/*",
  "lib/**/*",
  "test/**/*",
  "doc/**/*",
  "[A-Z]*",
  "install.rb",
  "rakefile"
].exclude(/\.svn|~$|\.swp$/)
54
+
55
+
56
desc "Default Task"
task :default => [ :test_all ]

# Test Tasks -------------------------------------------------------------

# short aliases
task :ta => :test_all
task :tf => :test_functional
task :tu => :test_units

# One test task per suite: all tests, unit tests only, functional tests only.
[
  ["test_all",        ['test/unit/*_test.rb', 'test/functional/*_test.rb']],
  ["test_units",      ['test/unit/*_test.rb']],
  ["test_functional", ['test/functional/*_test.rb']]
].each do |task_name, patterns|
  Rake::TestTask.new(task_name) do |t|
    t.test_files = FileList[*patterns]
    t.libs << "test"
    #t.warning = true
    t.verbose = true
  end
end
91
+
92
+
93
+
94
+ # Generate the RDoc documentation ----------------------------------------
95
+
96
# The task object is kept in +rd+ because the gem specification below reads
# its rdoc_files list.
rd = Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'doc/html'
  rdoc.title    = "RDig - Ferret based full text search for web sites"
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.options << '--main' << 'README'
  # an alternative rdoc template can be selected through the environment
  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
  rdoc.rdoc_files.include('README', 'CHANGES', 'LICENSE', 'TODO')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
105
+
106
+
107
+ # packaging --------------------------------------------------------------
108
+
109
+ # ====================================================================
110
+ # Create a task that will package the software into distributable
111
+ # tar, zip and gem files.
112
+
113
# The package tasks can only be defined when RubyGems has been loaded.
if ! defined?(Gem)
  puts "Package Target requires RubyGEMs"
else
  spec = Gem::Specification.new do |s|

    #### Basic information.

    s.name = 'rdig'
    s.version = PKG_VERSION
    s.summary = "Ruby based web site indexing and searching library."
    s.description = <<-EOF
      RDig provides an HTTP crawler and content extraction utilities
      to help building a site search for web sites or intranets. Internally,
      Ferret is used for the full text indexing. After creating a config file
      for your site, the index can be built with a single call to rdig.
    EOF

    #### Dependencies and requirements.

    s.add_dependency('ferret', '>= 0.3.2')
    s.add_dependency('rubyful_soup', '>= 1.0.4')
    #s.requirements << ""

    #### Which files are to be included in this gem? Everything listed in PKG_FILES.

    s.files = PKG_FILES.to_a

    #### Load-time details: library and application (you will need one or both).

    s.require_path = 'lib'                         # Use these for libraries.
    s.bindir = "bin"                               # Use these for applications.
    s.executables = ["rdig"]
    s.default_executable = "rdig"

    #### Documentation and testing.

    s.has_rdoc = true
    # all non-ruby files of the rdoc task (README, CHANGES, ...)
    s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
    # Same title as the rdoc task uses.
    s.rdoc_options <<
      '--title' << 'RDig - Ferret based full text search for web sites' <<
      '--main' << 'README' <<
      '--line-numbers'

    #### Author and project details.

    s.author = "Jens Kraemer"
    s.email = "jk@jkraemer.net"
    s.homepage = "http://rdig.rubyforge.org/"
    s.rubyforge_project = "rdig"
    # if ENV['CERT_DIR']
    #   s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
    #   s.cert_chain  = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
    # end
  end

  # builds the gem as well as zip and tar archives
  package_task = Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_zip = true
    pkg.need_tar = true
  end
end
173
+
174
+
175
+
176
+ # misc ----------------------------------------------------------------
177
+
178
# Counts the lines of the file +filename+.
#
# Returns a two element Array: the total number of lines, and the number of
# lines that are neither blank nor consist of a comment only.
#
# The file is opened with File.open, so the name is always treated as a
# path (Kernel#open would run a name starting with "|" as a command).
def count_lines(filename)
  lines = 0
  codelines = 0
  File.open(filename) do |f|
    f.each_line do |line|
      lines += 1
      codelines += 1 unless line =~ /^\s*$/ || line =~ /^\s*#/
    end
  end
  [lines, codelines]
end
191
+
192
# Prints one row of the line count table to standard output: the two
# counts right aligned in columns of width six, followed by +msg+.
def show_line(msg, lines, loc)
  columns = [lines.to_s, loc.to_s, msg]
  $stdout.printf("%6s %6s %s\n", *columns)
end
195
+
196
# Prints a table with total and code line counts for every file in SRC_RB,
# followed by the sums over all files.
desc "Count lines of code in the library sources"
task :lines do
  total_lines = 0
  total_code = 0
  show_line("File Name", "LINES", "LOC")
  SRC_RB.each do |fn|
    lines, codelines = count_lines(fn)
    show_line(fn, lines, codelines)
    total_lines += lines
    total_code += codelines
  end
  show_line("TOTAL", total_lines, total_code)
end
209
+
210
+ # Define an optional publish target in an external file. If the
211
+ # publish.rf file is not found, the publish targets won't be defined.
212
+
213
+ load "publish.rf" if File.exist? "publish.rf"
214
+
215
+
216
+ # Support Tasks ------------------------------------------------------
217
+
218
desc "Look for TODO and FIXME tags in the code"
task :todo do
  ruby_sources = FileList['**/*.rb'].exclude('pkg')
  ruby_sources.egrep(/#.*(FIXME|TODO|TBD)/)
end

desc "Look for Debugging print lines"
task :dbg do
  FileList['**/*.rb'].egrep(/\bDBG|\bbreakpoint\b/)
end

desc "List all ruby files"
task :rubyfiles do
  # ruby sources outside of pkg, then the executables in bin
  puts Dir['**/*.rb'].reject { |fn| fn =~ /^pkg/ }
  puts Dir['bin/*'].reject { |fn| fn =~ /CVS|(~$)|(\.rb$)/ }
end
task :rf => :rubyfiles
234
+
235
+
236
+ # --------------------------------------------------------------------
237
+ # Creating a release
238
+
239
desc "Make a new release"
task :release => [:prerelease, :clobber, :test_all, :update_version, :package, :tag] do
  # all the work is done by the prerequisites, only the closing banner is
  # printed here
  rule = "**************************************************************"
  [
    '',
    rule,
    "* Release #{PKG_VERSION} Complete.",
    "* Packages ready to upload.",
    rule,
    ''
  ].each { |line| announce line }
end
255
+
256
+ # Validate that everything is ready to go for a release.
257
# Validate that everything is ready to go for a release: a release number
# has to be given, it has to differ from the current version (unless REUSE
# is set) and, unless RELTEST is set, the working copy has to be clean.
task :prerelease do
  rule = "**************************************************************"
  [
    '',
    rule,
    "* Making RubyGem Release #{PKG_VERSION}",
    "* (current version #{CURRENT_VERSION})",
    rule,
    ''
  ].each { |line| announce line }

  # Is a release number supplied?
  fail "Usage: rake release REL=x.y.z [REUSE=tag_suffix]" unless ENV['REL']

  # Is the release different than the current release.
  # (or is REUSE set?)
  if PKG_VERSION == CURRENT_VERSION && ! ENV['REUSE']
    fail "Current version is #{PKG_VERSION}, must specify REUSE=tag_suffix to reuse version"
  end

  # Are all source files checked in?
  if ENV['RELTEST']
    announce "Release Task Testing, skipping checked-in file test"
  else
    announce "Checking for unchecked-in files..."
    svn_status = `svn st`
    unless svn_status =~ /^$/
      fail "SVN status is not clean ... do you have unchecked-in files?"
    end
    announce "No outstanding checkins found ... OK"
  end
end
288
+
289
# Rewrites the RDIGVERSION assignment in lib/rdig.rb to the version being
# released and commits the file (the commit is skipped when RELTEST is set).
task :update_version => [:prerelease] do
  if PKG_VERSION == CURRENT_VERSION
    announce "No version change ... skipping version update"
    next
  end

  announce "Updating RDig version to #{PKG_VERSION}"
  # copy the file line by line into a new one, replacing the version line,
  # then move the new file over the old one
  File.open("lib/rdig.rb") do |source|
    File.open("lib/rdig.rb.new", "w") do |target|
      source.each do |line|
        line = "RDIGVERSION = '#{PKG_VERSION}'" if line =~ /^RDIGVERSION\s*=\s*/
        target.puts line
      end
    end
  end
  mv "lib/rdig.rb.new", "lib/rdig.rb"

  if ENV['RELTEST']
    announce "Release Task Testing, skipping commiting of new version"
  else
    sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
  end
end
313
+
314
desc "Tag all files with the latest release number (REL=x.y.z)"
task :tag => [:prerelease] do
  # dots are not used in tag names: 0.1.0 becomes REL_0_1_0, followed by
  # the REUSE suffix if one is given
  tag_parts = ["REL_#{PKG_VERSION.gsub(/\./, '_')}"]
  tag_parts << ENV['REUSE'].gsub(/\./, '_') if ENV['REUSE']
  reltag = tag_parts.join
  announce "Tagging with [#{reltag}]"
  if ENV['RELTEST']
    announce "Release Task Testing, skipping tagging"
  else
    sh %{cd ..; svn copy trunk tags/#{reltag}}
  end
end
325
+