rdig 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +7 -0
- data/README +1 -1
- data/lib/rdig.rb +2 -2
- data/lib/rdig/index.rb +11 -12
- data/rakefile +1 -1
- metadata +21 -20
data/CHANGES
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
0.3.2
|
2
|
+
- make RDig compatible with Ferret 0.10.x
|
3
|
+
- won't work any more with Ferret 0.9.x and before
|
4
|
+
|
5
|
+
0.3.1
|
6
|
+
- Bug fix release: fixed handling of unparseable URLs
|
7
|
+
|
1
8
|
0.3.0
|
2
9
|
- file system crawling
|
3
10
|
- optional url rewriting before indexing, e.g. for linking to results
|
data/README
CHANGED
@@ -5,7 +5,7 @@ to help building a site search for web sites or intranets. Internally,
|
|
5
5
|
Ferret is used for the full text indexing. After creating a config file
|
6
6
|
for your site, the index can be built with a single call to rdig.
|
7
7
|
|
8
|
-
RDig depends on Ferret (>= 0.
|
8
|
+
RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).
|
9
9
|
|
10
10
|
== basic usage
|
11
11
|
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.1'
|
27
|
+
RDIGVERSION = '0.3.2'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -142,7 +142,7 @@ module RDig
|
|
142
142
|
:create => true,
|
143
143
|
:handle_parse_errors => true,
|
144
144
|
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
145
|
-
:occur_default =>
|
145
|
+
:occur_default => :must
|
146
146
|
)
|
147
147
|
)
|
148
148
|
end
|
data/lib/rdig/index.rb
CHANGED
@@ -3,27 +3,26 @@ module RDig
|
|
3
3
|
|
4
4
|
# used by the crawler to build the ferret index
|
5
5
|
class Indexer
|
6
|
-
include MonitorMixin
|
6
|
+
include MonitorMixin
|
7
7
|
|
8
8
|
def initialize(settings)
|
9
9
|
@config = settings
|
10
|
-
@index_writer = IndexWriter.new(
|
11
|
-
|
12
|
-
|
10
|
+
@index_writer = Ferret::Index::IndexWriter.new(
|
11
|
+
:path => settings.path,
|
12
|
+
:create => settings.create,
|
13
|
+
:analyzer => settings.analyzer)
|
13
14
|
super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
|
14
15
|
end
|
15
16
|
|
16
17
|
def add_to_index(document)
|
17
18
|
puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
|
18
|
-
doc = Ferret::Document::Document.new
|
19
19
|
@config.rewrite_uri.call(document.uri) if @config.rewrite_uri
|
20
|
-
|
21
|
-
doc
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
20
|
+
# all stored and tokenized, should be ferret defaults
|
21
|
+
doc = {
|
22
|
+
:url => document.uri.to_s,
|
23
|
+
:title => document.title,
|
24
|
+
:data => document.body
|
25
|
+
}
|
27
26
|
synchronize do
|
28
27
|
@index_writer << doc
|
29
28
|
end
|
data/rakefile
CHANGED
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.0
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.1
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.2
|
7
|
+
date: 2006-10-09 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,49 +25,50 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- Jens Kraemer
|
30
31
|
files:
|
31
32
|
- bin/rdig
|
32
33
|
- lib/rdig
|
33
|
-
- lib/rdig.rb
|
34
34
|
- lib/htmlentities
|
35
|
-
- lib/rdig
|
36
|
-
- lib/rdig/
|
37
|
-
- lib/rdig/content_extractors.rb
|
35
|
+
- lib/rdig.rb
|
36
|
+
- lib/rdig/crawler.rb
|
38
37
|
- lib/rdig/search.rb
|
39
38
|
- lib/rdig/highlight.rb
|
40
39
|
- lib/rdig/index.rb
|
41
40
|
- lib/rdig/url_filters.rb
|
42
|
-
- lib/rdig/
|
43
|
-
- lib/
|
44
|
-
- lib/
|
41
|
+
- lib/rdig/content_extractors.rb
|
42
|
+
- lib/rdig/documents.rb
|
43
|
+
- lib/rdig/file.rb
|
45
44
|
- lib/htmlentities/CHANGES
|
46
45
|
- lib/htmlentities/COPYING
|
46
|
+
- lib/htmlentities/README
|
47
|
+
- lib/htmlentities/htmlentities.rb
|
47
48
|
- test/unit
|
48
49
|
- test/fixtures
|
49
50
|
- test/test_helper.rb
|
50
|
-
- test/unit/html_content_extractor_test.rb
|
51
|
-
- test/unit/url_filters_test.rb
|
52
|
-
- test/unit/word_content_extractor_test.rb
|
53
|
-
- test/unit/crawler_fs_test.rb
|
54
51
|
- test/unit/etag_filter_test.rb
|
52
|
+
- test/unit/url_filters_test.rb
|
53
|
+
- test/unit/html_content_extractor_test.rb
|
55
54
|
- test/unit/pdf_content_extractor_test.rb
|
55
|
+
- test/unit/word_content_extractor_test.rb
|
56
56
|
- test/unit/file_document_test.rb
|
57
|
-
- test/
|
57
|
+
- test/unit/crawler_fs_test.rb
|
58
58
|
- test/fixtures/html
|
59
|
+
- test/fixtures/pdf
|
59
60
|
- test/fixtures/word
|
60
|
-
- test/fixtures/pdf/simple.pdf
|
61
61
|
- test/fixtures/html/entities.html
|
62
|
-
- test/fixtures/html/custom_tag_selectors.html
|
63
62
|
- test/fixtures/html/simple.html
|
63
|
+
- test/fixtures/html/custom_tag_selectors.html
|
64
|
+
- test/fixtures/pdf/simple.pdf
|
64
65
|
- test/fixtures/word/simple.doc
|
65
66
|
- doc/examples
|
66
67
|
- doc/examples/config.rb
|
67
|
-
- TODO
|
68
68
|
- LICENSE
|
69
|
-
-
|
69
|
+
- TODO
|
70
70
|
- CHANGES
|
71
|
+
- README
|
71
72
|
- install.rb
|
72
73
|
- rakefile
|
73
74
|
test_files: []
|
@@ -97,7 +98,7 @@ dependencies:
|
|
97
98
|
requirements:
|
98
99
|
- - ">="
|
99
100
|
- !ruby/object:Gem::Version
|
100
|
-
version: 0.
|
101
|
+
version: 0.10.0
|
101
102
|
version:
|
102
103
|
- !ruby/object:Gem::Dependency
|
103
104
|
name: rubyful_soup
|