rdig 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,7 +18,7 @@ RDig.configuration do |cfg|
18
18
 
19
19
  # this is the path where the index will be stored
20
20
  # caution, existing contents of this directory will be deleted!
21
- cfg.indexer.path = '/path/to/index'
21
+ cfg.index.path = '/path/to/index'
22
22
 
23
23
  ##################################################################
24
24
  # options you might want to set, the given values are the defaults
@@ -255,6 +255,8 @@ module RDig
255
255
  content << ' '
256
256
  end
257
257
  elsif element.string # it's a Tag, and it has some content string
258
+ # skip inline scripts and styles
259
+ return nil if element.name =~ /^(script|style)$/i
258
260
  value = element.string.strip
259
261
  unless value.empty?
260
262
  content << value
data/lib/rdig/crawler.rb CHANGED
@@ -76,6 +76,8 @@ module RDig
76
76
  @documents << doc
77
77
  puts "added url #{url}" if RDig::config.verbose
78
78
  end
79
+ rescue
80
+ nil
79
81
  end
80
82
 
81
83
  end
@@ -32,7 +32,7 @@ module RDig
32
32
  begin
33
33
  @uri = URI.parse(args[:url])
34
34
  rescue URI::InvalidURIError
35
- raise "Cannot create document using invalid URL: #{url}"
35
+ raise "Cannot create document using invalid URL: #{args[:url]}"
36
36
  end
37
37
  end
38
38
 
@@ -118,7 +118,7 @@ module RDig
118
118
  @content = ContentExtractors.process(doc.read, doc.content_type)
119
119
  @status = :success
120
120
  when 404
121
- puts "got 404 for #{url}"
121
+ puts "got 404 for #{@uri}"
122
122
  else
123
123
  puts "don't know what to do with response: #{doc.status.join(' : ')}"
124
124
  end
@@ -89,11 +89,11 @@ module RDig
89
89
  @patterns = []
90
90
  if args.respond_to? :each
91
91
  args.each { |pattern|
92
- # cloning because unsure if regexps are thread safe...
93
- @patterns << pattern.clone
92
+ # cloning because unsure if regexps are thread safe ?
93
+ @patterns << pattern #.clone
94
94
  }
95
95
  else
96
- @patterns << args.clone
96
+ @patterns << args #.clone
97
97
  end
98
98
  end
99
99
  end
data/lib/rdig.rb CHANGED
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.0'
27
+ RDIGVERSION = '0.3.1'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -228,6 +228,7 @@ module RDig
228
228
 
229
229
  # Run the +rdig+ application.
230
230
  def run
231
+ puts "RDig version #{RDIGVERSION}"
231
232
  handle_options
232
233
  begin
233
234
  load_configfile
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.11.15
2
+ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2006-04-26 00:00:00 +02:00
6
+ version: 0.3.1
7
+ date: 2006-07-26 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
@@ -25,50 +25,49 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
- post_install_message:
29
28
  authors:
30
29
  - Jens Kraemer
31
30
  files:
32
31
  - bin/rdig
33
32
  - lib/rdig
34
- - lib/htmlentities
35
33
  - lib/rdig.rb
36
- - lib/rdig/crawler.rb
34
+ - lib/htmlentities
35
+ - lib/rdig/documents.rb
36
+ - lib/rdig/file.rb
37
+ - lib/rdig/content_extractors.rb
37
38
  - lib/rdig/search.rb
38
39
  - lib/rdig/highlight.rb
39
40
  - lib/rdig/index.rb
40
41
  - lib/rdig/url_filters.rb
41
- - lib/rdig/content_extractors.rb
42
- - lib/rdig/documents.rb
43
- - lib/rdig/file.rb
42
+ - lib/rdig/crawler.rb
43
+ - lib/htmlentities/htmlentities.rb
44
+ - lib/htmlentities/README
44
45
  - lib/htmlentities/CHANGES
45
46
  - lib/htmlentities/COPYING
46
- - lib/htmlentities/README
47
- - lib/htmlentities/htmlentities.rb
48
47
  - test/unit
49
48
  - test/fixtures
50
49
  - test/test_helper.rb
51
- - test/unit/etag_filter_test.rb
52
- - test/unit/url_filters_test.rb
53
50
  - test/unit/html_content_extractor_test.rb
54
- - test/unit/pdf_content_extractor_test.rb
51
+ - test/unit/url_filters_test.rb
55
52
  - test/unit/word_content_extractor_test.rb
56
- - test/unit/file_document_test.rb
57
53
  - test/unit/crawler_fs_test.rb
58
- - test/fixtures/html
54
+ - test/unit/etag_filter_test.rb
55
+ - test/unit/pdf_content_extractor_test.rb
56
+ - test/unit/file_document_test.rb
59
57
  - test/fixtures/pdf
58
+ - test/fixtures/html
60
59
  - test/fixtures/word
60
+ - test/fixtures/pdf/simple.pdf
61
61
  - test/fixtures/html/entities.html
62
- - test/fixtures/html/simple.html
63
62
  - test/fixtures/html/custom_tag_selectors.html
64
- - test/fixtures/pdf/simple.pdf
63
+ - test/fixtures/html/simple.html
65
64
  - test/fixtures/word/simple.doc
66
65
  - doc/examples
67
66
  - doc/examples/config.rb
68
- - LICENSE
69
67
  - TODO
70
- - CHANGES
68
+ - LICENSE
71
69
  - README
70
+ - CHANGES
72
71
  - install.rb
73
72
  - rakefile
74
73
  test_files: []