rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
module RDig
  module Search

    # Beginning of a port of the query term highlighter from Lucene contrib.
    #
    # Fragment extraction is not implemented yet: get_best_text_fragments is
    # a stub that yields no fragments, so best_fragments currently returns an
    # empty list unless a subclass supplies a real implementation.
    class Highlighter
      # Picks up the analyzer from the ferret section of the RDig
      # configuration.
      def initialize
        @analyzer = RDig.config.ferret.analyzer
      end

      # Returns the string form of every fragment with a positive score.
      #
      # scorer::        currently unused.
      # text::          the text to extract fragments from; it is tokenized
      #                 as field 'body'.
      # max_fragments:: passed through to get_best_text_fragments.
      def best_fragments(scorer, text, max_fragments = 1)
        token_stream = @analyzer.token_stream('body', text)
        # tolerate implementations that return nil instead of a collection
        fragments = get_best_text_fragments(token_stream, text, max_fragments) || []
        frag_texts = []
        fragments.each { |frag|
          frag_texts << frag.to_s if (frag && frag.score > 0)
        }
        frag_texts
      end

      # Not implemented yet. Returns an empty list so that callers can
      # iterate over the result safely.
      def get_best_text_fragments(token_stream, text, max_fragments)
        []
      end
    end

  end
end
@@ -0,0 +1,22 @@
1
module RDig

  # Minimal HTTP GET helper, meant to be mixed into classes that fetch pages.
  module HttpClient
    # Performs an HTTP GET for +uri+ and returns whatever Net::HTTP#get
    # returns for it.
    #
    # uri::        object responding to host, port, path and query.
    # user_agent:: value sent in the User-Agent request header.
    #
    # Any StandardError raised while connecting or fetching is printed to
    # standard output and nil is returned instead.
    def do_get(uri, user_agent='RDig crawler')
      # Set up the appropriate http headers
      headers = { "User-Agent" => user_agent }

      # An URI like http://host?x=1 has an empty path; the request target
      # must still start with a slash.
      request_path = uri.path.to_s
      request_path = '/' if request_path.empty?
      request_path += ('?' + uri.query) if uri.query

      Net::HTTP.start(uri.host, (uri.port || 80)) { |http|
        http.get(request_path, headers)
      }
    rescue => error
      puts error
      nil
    end
  end

end
22
+
data/lib/rdig/index.rb ADDED
@@ -0,0 +1,39 @@
1
module RDig
  module Index

    # Builds the ferret index; used by the crawler.
    class Indexer
      include MonitorMixin, Ferret::Index, Ferret::Document

      # Opens an IndexWriter at settings.path, passing settings.create and
      # settings.analyzer through as the :create and :analyzer options.
      def initialize(settings)
        writer_options = { :create => settings.create,
                           :analyzer => settings.analyzer }
        @index_writer = IndexWriter.new(settings.path, writer_options)
        # The explicit empty parentheses matter: MonitorMixin is not
        # initialized unless super is called without arguments.
        super()
      end

      # Converts +document+ into a ferret document with the stored fields
      # "url" (untokenized), "title" and "data" (both tokenized) and appends
      # it to the index. Only the write itself happens under the monitor.
      def add_to_index(document)
        puts "add to index: #{document.uri.to_s}"
        ferret_doc = Ferret::Document::Document.new
        ferret_doc << Field.new("url", document.url,
                                Field::Store::YES, Field::Index::UNTOKENIZED)
        ferret_doc << Field.new("title", document.title,
                                Field::Store::YES, Field::Index::TOKENIZED)
        ferret_doc << Field.new("data", document.body,
                                Field::Store::YES, Field::Index::TOKENIZED)
        synchronize { @index_writer << ferret_doc }
      end
      alias :<< :add_to_index

      # Optimizes and closes the index writer, then drops the reference
      # to it.
      def close
        writer = @index_writer
        writer.optimize
        writer.close
        @index_writer = nil
      end
    end

  end
end
@@ -0,0 +1,77 @@
1
module RDig
  module Search

    # Searches the index.
    # Call RDig::searcher to retrieve an instance ready for use.
    class Searcher
      include Ferret::Search

      # the query parser used to parse query strings
      attr_reader :query_parser

      # Takes the ferret section of the rdig configuration as a parameter
      # and opens the index searcher right away.
      def initialize(settings)
        @ferret_config = settings
        @query_parser = Ferret::QueryParser.new('*', settings.marshal_dump)
        ferret_searcher
      end

      # Returns the Ferret::Search::IndexSearcher instance used internally.
      # A searcher whose reader is no longer the latest one is closed and
      # replaced; whenever a new searcher is opened, the query parser's
      # field list is refreshed from the reader's field names.
      def ferret_searcher
        outdated = @ferret_searcher && !@ferret_searcher.reader.latest?
        if outdated
          # reopen searcher
          @ferret_searcher.close
          @ferret_searcher = nil
        end
        return @ferret_searcher if @ferret_searcher

        @ferret_searcher = IndexSearcher.new(@ferret_config.path)
        @query_parser.fields = @ferret_searcher.reader.get_field_names.to_a
        @ferret_searcher
      end

      # Runs a search.
      # +query+ usually will be a user-entered string. See the Ferret query
      # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
      # for more information on queries.
      # A Ferret::Search::Query instance may be given, too.
      #
      # Options are:
      # first_doc:: first document in result list to retrieve (0-based). The default is 0.
      # num_docs::  number of documents to retrieve. The default is 10.
      #
      # Returns a hash with the total hit count under :hitcount and, under
      # :list, one hash per hit with the keys :score, :title, :url and
      # :extract.
      def search(query, options={})
        query = query_parser.parse(query) if query.is_a?(String)
        puts "Query: #{query}"
        hits = ferret_searcher.search(query, options)
        hit_count = hits.total_hits
        entries = []
        hits.each do |doc_id, score|
          doc = ferret_searcher.reader.get_document doc_id
          entries << { :score => score,
                       :title => doc['title'],
                       :url => doc['url'],
                       :extract => build_extract(doc['data']) }
        end
        { :hitcount => hit_count, :list => entries }
      end

      # Returns the leading part of +data+ (data[0..200]) when it is longer
      # than 200 characters, otherwise +data+ itself (which may be nil).
      def build_extract(data)
        return data unless data && data.length > 200
        data[0..200]
      end

    end

    # class SearchResult < OpenStruct
    #   def initialize(doc, score)
    #     self.score = score
    #     self.title = doc[:title]
    #     self.url = doc[:url]
    #     self.extract = doc[:content][0..200]
    #   end
    # end

  end
end
@@ -0,0 +1,171 @@
1
module RDig

  # Filters applied to every document (URL) found during a crawl. A filter
  # returns the document to keep it in the chain, or nil to drop it.
  module UrlFilters

    # An ordered list of filters built from a configuration array.
    class FilterChain
      # chain_config is an Enumerable whose entries are one of:
      # * a Hash mapping filters to their arguments,
      # * an Array whose first element is the filter and whose remaining
      #   elements (as an Array) are its arguments,
      # * a bare filter (Symbol or Class).
      #
      # The configuration itself is left untouched, so the same
      # configuration can be used to build several chains.
      def initialize(chain_config)
        @filters = []
        chain_config.each { |entry|
          case entry
          when Hash
            entry.each_pair { |filter, args|
              add(filter, args)
            }
          when Array
            # destructure instead of shift-ing, which would modify the
            # caller's configuration
            filter, *args = entry
            add(filter, args)
          else
            add(entry)
          end
        }
      end

      # Adds a filter and its args to the chain.
      # When args is a Symbol, it is treated as a configuration key and
      # replaced by the value of RDig.config.crawler.send(args).
      #
      # A Symbol filter names a module method of UrlFilters. A Class filter
      # is instantiated (via +instance+ if the class responds to it and no
      # args are given) and its +apply+ method is used. Any other kind of
      # filter is silently ignored.
      def add(filter, args=nil)
        args = RDig.config.crawler.send(args) if args.is_a? Symbol
        case filter
        when Symbol
          if args.nil?
            @filters << lambda { |document|
              UrlFilters.send(filter, document)
            }
          else
            @filters << lambda { |document|
              UrlFilters.send(filter, document, args)
            }
          end
        when Class
          if args.nil?
            if filter.respond_to?(:instance)
              filter_instance = filter.instance
            else
              filter_instance = filter.new
            end
          else
            filter_instance = filter.new(args)
          end
          @filters << lambda { |document|
            filter_instance.apply(document)
          }
        end
      end

      # Runs the document through all filters in order. Returns nil as soon
      # as one filter rejects it, the document otherwise.
      def apply(document)
        @filters.each { |filter|
          return nil unless filter.call(document)
        }
        return document
      end
    end

    # Keeps a list of all URLs visited during a crawl, to avoid indexing
    # pages more than once.
    # Implemented as a singleton guarded by a monitor, as it has to be
    # shared between all crawler threads.
    class VisitedUrlFilter
      include MonitorMixin, Singleton
      def initialize
        @visited_urls = Set.new
        super()
      end

      # Returns document if this document's url has not been visited yet,
      # nil otherwise.
      def apply(document)
        synchronize do
          @visited_urls.add?(document.uri.to_s) ? document : nil
        end
      end
    end


    # Base class for url inclusion / exclusion filters.
    class UrlPatternFilter
      # Takes an Array of Regexps (or a single pattern), or nil to disable
      # the filter.
      def initialize(args=nil)
        unless args.nil?
          @patterns = []
          if args.respond_to? :each
            args.each { |pattern|
              # cloning because unsure if regexps are thread safe...
              @patterns << pattern.clone
            }
          else
            @patterns << args.clone
          end
        end
      end
    end

    class UrlExclusionFilter < UrlPatternFilter
      # Returns nil if any of the patterns matches the document's URL,
      # the document itself otherwise (or if the filter is disabled).
      def apply(document)
        return document unless @patterns
        @patterns.each { |p|
          return nil if document.uri.to_s =~ p
        }
        return document
      end
    end

    class UrlInclusionFilter < UrlPatternFilter
      # Returns the document if any of the patterns matches its URL (or if
      # the filter is disabled), nil otherwise.
      def apply(document)
        return document unless @patterns
        @patterns.each { |p|
          return document if document.uri.to_s =~ p
        }
        return nil
      end
    end


    # Checks the redirect count of the given document and takes it out of
    # the chain if the number of redirections exceeds max_redirects.
    def UrlFilters.maximum_redirect_filter(document, max_redirects)
      return nil if document.redirections > max_redirects
      return document
    end

    # Expands both href="/path/xyz.html" and href="affe.html" to full urls,
    # using the document's referring URI for the missing parts. The
    # document's own URI object is modified in place; the referring URI is
    # only read.
    #
    # Returns nil for URIs with a scheme that does not start with "http",
    # the document otherwise.
    def UrlFilters.fix_relative_uri(document)
      return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^http/i
      ref = document.referring_uri
      return document unless ref
      uri = document.uri
      uri.scheme = ref.scheme unless uri.scheme
      uri.host = ref.host unless uri.host
      uri.port = ref.port unless uri.port || ref.port==ref.default_port
      uri.path = ref.path unless uri.path

      if uri.path !~ /^\//
        # work on a local value: appending to ref.path directly would
        # change the referring URI
        ref_path = ref.path.to_s
        ref_path = '/' if ref_path.empty?
        # prefix the relative path with the referrer's directory
        uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
      end
      return document
    end

    # Keeps the document only if its host is contained in include_hosts.
    def UrlFilters.hostname_filter(document, include_hosts)
      return document if include_hosts.include?(document.uri.host)
      return nil
    end

    # Strips the fragment from the document's URI and, if an index document
    # is configured (RDig.config.index_document) and the path ends with a
    # slash, appends the index document to the path.
    def UrlFilters.normalize_uri(document)
      document.uri.fragment = nil
      # document.uri.query = nil
      index_document = RDig.config.index_document
      if index_document && document.uri.path =~ /\/$/
        # assign through the setter instead of appending to the string the
        # URI holds internally
        document.uri.path += index_document
      end
      return document
    end

  end
end
data/rakefile ADDED
@@ -0,0 +1,325 @@
1
+ # rakefile for RDig.
2
+ # large parts borrowed from rake's Rakefile
3
+
4
# RubyGems and the gem packaging task are optional. Only a failed require is
# tolerated here; rescuing Exception would also swallow interrupts and exits.
begin
  require 'rubygems'
  require 'rake/gempackagetask'
rescue LoadError
  nil
end
10
+ require 'rake'
11
+ require 'rake/testtask'
12
+ require 'rake/rdoctask'
13
+ require 'rake/packagetask'
14
+ require 'rake/contrib/rubyforgepublisher'
15
+
16
+ def announce(msg='')
17
+ STDERR.puts msg
18
+ end
19
+
20
+
21
+ PKG_NAME = 'rdig'
22
+
23
+ # Determine the current version of the software
24
+ if `ruby -Ilib ./bin/rdig --version` =~ /rdig, version ([0-9.]+)$/
25
+ CURRENT_VERSION = $1
26
+ else
27
+ CURRENT_VERSION = "0.0.0"
28
+ end
29
+
30
+ if ENV['REL']
31
+ PKG_VERSION = ENV['REL']
32
+ else
33
+ PKG_VERSION = CURRENT_VERSION
34
+ end
35
+
36
+ SRC_RB = FileList['lib/**/*.rb']
37
+
38
+ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
39
+
40
+ RELEASE_NAME = "REL #{PKG_VERSION}"
41
+
42
+ RUBY_FORGE_PROJECT = "rdig"
43
+ RUBY_FORGE_USER = "jkraemer"
44
+
45
+ PKG_FILES = FileList[
46
+ "bin/**/*",
47
+ "lib/**/*",
48
+ "test/**/*",
49
+ "doc/**/*",
50
+ "[A-Z]*",
51
+ "install.rb",
52
+ "rakefile"
53
+ ].exclude(/\.svn|~$|\.swp$/)
54
+
55
+
56
+ desc "Default Task"
57
+ task :default => [ :test_all ]
58
+
59
+ # Test Tasks -------------------------------------------------------------
60
+
61
+ task :ta => :test_all
62
+ task :tf => :test_functional
63
+ task :tu => :test_units
64
+
65
+ # Run all tests
66
+ Rake::TestTask.new("test_all") { |t|
67
+ t.test_files = FileList[
68
+ 'test/unit/*_test.rb',
69
+ 'test/functional/*_test.rb'
70
+ ]
71
+ t.libs << "test"
72
+ #t.warning = true
73
+ t.verbose = true
74
+ }
75
+
76
+ # Run unit tests
77
+ Rake::TestTask.new("test_units") { |t|
78
+ t.test_files = FileList[ 'test/unit/*_test.rb' ]
79
+ t.libs << "test"
80
+ #t.warning = true
81
+ t.verbose = true
82
+ }
83
+
84
+ # Run functional tests
85
+ Rake::TestTask.new("test_functional") { |t|
86
+ t.test_files = FileList[ 'test/functional/*_test.rb' ]
87
+ t.libs << "test"
88
+ #t.warning = true
89
+ t.verbose = true
90
+ }
91
+
92
+
93
+
94
+ # Generate the RDoc documentation ----------------------------------------
95
+
96
+ rd = Rake::RDocTask.new { |rdoc|
97
+ rdoc.rdoc_dir = 'doc/html'
98
+ rdoc.title = "RDig - Ferret based full text search for web sites"
99
+ rdoc.options << '--line-numbers' << '--inline-source'
100
+ rdoc.options << '--main' << 'README'
101
+ rdoc.template = "#{ENV['template']}.rb" if ENV['template']
102
+ rdoc.rdoc_files.include('README', 'CHANGES', 'LICENSE', 'TODO')
103
+ rdoc.rdoc_files.include('lib/**/*.rb')
104
+ }
105
+
106
+
107
+ # packaging --------------------------------------------------------------
108
+
109
+ # ====================================================================
110
+ # Create a task that will package the software into distributable
111
+ # tar, zip and gem files.
112
+
113
+ if ! defined?(Gem)
114
+ puts "Package Target requires RubyGEMs"
115
+ else
116
+ spec = Gem::Specification.new do |s|
117
+
118
+ #### Basic information.
119
+
120
+ s.name = 'rdig'
121
+ s.version = PKG_VERSION
122
+ s.summary = "Ruby based web site indexing and searching library."
123
+ s.description = <<-EOF
124
+ RDig provides an HTTP crawler and content extraction utilities
125
+ to help building a site search for web sites or intranets. Internally,
126
+ Ferret is used for the full text indexing. After creating a config file
127
+ for your site, the index can be built with a single call to rdig.
128
+ EOF
129
+
130
+ #### Dependencies and requirements.
131
+
132
+ s.add_dependency('ferret', '>= 0.3.2')
133
+ s.add_dependency('rubyful_soup', '>= 1.0.4')
134
+ #s.requirements << ""
135
+
136
+ #### Which files are to be included in this gem? Everything! (Except CVS directories.)
137
+
138
+ s.files = PKG_FILES.to_a
139
+
140
+ #### Load-time details: library and application (you will need one or both).
141
+
142
+ s.require_path = 'lib' # Use these for libraries.
143
+ s.bindir = "bin" # Use these for applications.
144
+ s.executables = ["rdig"]
145
+ s.default_executable = "rdig"
146
+
147
+ #### Documentation and testing.
148
+
149
+ s.has_rdoc = true
150
+ s.extra_rdoc_files = rd.rdoc_files.reject { |fn| fn =~ /\.rb$/ }.to_a
151
+ s.rdoc_options <<
152
+ '--title' << 'Rake -- Ruby Make' <<
153
+ '--main' << 'README' <<
154
+ '--line-numbers'
155
+
156
+ #### Author and project details.
157
+
158
+ s.author = "Jens Kraemer"
159
+ s.email = "jk@jkraemer.net"
160
+ s.homepage = "http://rdig.rubyforge.org/"
161
+ s.rubyforge_project = "rdig"
162
+ # if ENV['CERT_DIR']
163
+ # s.signing_key = File.join(ENV['CERT_DIR'], 'gem-private_key.pem')
164
+ # s.cert_chain = [File.join(ENV['CERT_DIR'], 'gem-public_cert.pem')]
165
+ # end
166
+ end
167
+
168
+ package_task = Rake::GemPackageTask.new(spec) do |pkg|
169
+ pkg.need_zip = true
170
+ pkg.need_tar = true
171
+ end
172
+ end
173
+
174
+
175
+
176
+ # misc ----------------------------------------------------------------
177
+
178
+ def count_lines(filename)
179
+ lines = 0
180
+ codelines = 0
181
+ open(filename) { |f|
182
+ f.each do |line|
183
+ lines += 1
184
+ next if line =~ /^\s*$/
185
+ next if line =~ /^\s*#/
186
+ codelines += 1
187
+ end
188
+ }
189
+ [lines, codelines]
190
+ end
191
+
192
+ def show_line(msg, lines, loc)
193
+ printf "%6s %6s %s\n", lines.to_s, loc.to_s, msg
194
+ end
195
+
196
+ desc "Count lines in the main rake file"
197
+ task :lines do
198
+ total_lines = 0
199
+ total_code = 0
200
+ show_line("File Name", "LINES", "LOC")
201
+ SRC_RB.each do |fn|
202
+ lines, codelines = count_lines(fn)
203
+ show_line(fn, lines, codelines)
204
+ total_lines += lines
205
+ total_code += codelines
206
+ end
207
+ show_line("TOTAL", total_lines, total_code)
208
+ end
209
+
210
+ # Define an optional publish target in an external file. If the
211
+ # publish.rf file is not found, the publish targets won't be defined.
212
+
213
+ load "publish.rf" if File.exist? "publish.rf"
214
+
215
+
216
+ # Support Tasks ------------------------------------------------------
217
+
218
+ desc "Look for TODO and FIXME tags in the code"
219
+ task :todo do
220
+ FileList['**/*.rb'].exclude('pkg').egrep /#.*(FIXME|TODO|TBD)/
221
+ end
222
+
223
+ desc "Look for Debugging print lines"
224
+ task :dbg do
225
+ FileList['**/*.rb'].egrep /\bDBG|\bbreakpoint\b/
226
+ end
227
+
228
+ desc "List all ruby files"
229
+ task :rubyfiles do
230
+ puts Dir['**/*.rb'].reject { |fn| fn =~ /^pkg/ }
231
+ puts Dir['bin/*'].reject { |fn| fn =~ /CVS|(~$)|(\.rb$)/ }
232
+ end
233
+ task :rf => :rubyfiles
234
+
235
+
236
+ # --------------------------------------------------------------------
237
+ # Creating a release
238
+
239
+ desc "Make a new release"
240
+ task :release => [
241
+ :prerelease,
242
+ :clobber,
243
+ :test_all,
244
+ :update_version,
245
+ :package,
246
+ :tag] do
247
+
248
+ announce
249
+ announce "**************************************************************"
250
+ announce "* Release #{PKG_VERSION} Complete."
251
+ announce "* Packages ready to upload."
252
+ announce "**************************************************************"
253
+ announce
254
+ end
255
+
256
+ # Validate that everything is ready to go for a release.
257
+ task :prerelease do
258
+ announce
259
+ announce "**************************************************************"
260
+ announce "* Making RubyGem Release #{PKG_VERSION}"
261
+ announce "* (current version #{CURRENT_VERSION})"
262
+ announce "**************************************************************"
263
+ announce
264
+
265
+ # Is a release number supplied?
266
+ unless ENV['REL']
267
+ fail "Usage: rake release REL=x.y.z [REUSE=tag_suffix]"
268
+ end
269
+
270
+ # Is the release different than the current release.
271
+ # (or is REUSE set?)
272
+ if PKG_VERSION == CURRENT_VERSION && ! ENV['REUSE']
273
+ fail "Current version is #{PKG_VERSION}, must specify REUSE=tag_suffix to reuse version"
274
+ end
275
+
276
+ # Are all source files checked in?
277
+ if ENV['RELTEST']
278
+ announce "Release Task Testing, skipping checked-in file test"
279
+ else
280
+ announce "Checking for unchecked-in files..."
281
+ data = `svn st`
282
+ unless data =~ /^$/
283
+ fail "SVN status is not clean ... do you have unchecked-in files?"
284
+ end
285
+ announce "No outstanding checkins found ... OK"
286
+ end
287
+ end
288
+
289
+ task :update_version => [:prerelease] do
290
+ if PKG_VERSION == CURRENT_VERSION
291
+ announce "No version change ... skipping version update"
292
+ else
293
+ announce "Updating RDig version to #{PKG_VERSION}"
294
+ open("lib/rdig.rb") do |rakein|
295
+ open("lib/rdig.rb.new", "w") do |rakeout|
296
+ rakein.each do |line|
297
+ if line =~ /^RDIGVERSION\s*=\s*/
298
+ rakeout.puts "RDIGVERSION = '#{PKG_VERSION}'"
299
+ else
300
+ rakeout.puts line
301
+ end
302
+ end
303
+ end
304
+ end
305
+ mv "lib/rdig.rb.new", "lib/rdig.rb"
306
+ if ENV['RELTEST']
307
+ announce "Release Task Testing, skipping commiting of new version"
308
+ else
309
+ sh %{svn commit -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
310
+ end
311
+ end
312
+ end
313
+
314
+ desc "Tag all files with the latest release number (REL=x.y.z)"
315
+ task :tag => [:prerelease] do
316
+ reltag = "REL_#{PKG_VERSION.gsub(/\./, '_')}"
317
+ reltag << ENV['REUSE'].gsub(/\./, '_') if ENV['REUSE']
318
+ announce "Tagging with [#{reltag}]"
319
+ if ENV['RELTEST']
320
+ announce "Release Task Testing, skipping tagging"
321
+ else
322
+ sh %{cd ..; svn copy trunk tags/#{reltag}}
323
+ end
324
+ end
325
+