rdig 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +7 -0
- data/README +1 -1
- data/lib/rdig.rb +2 -2
- data/lib/rdig/index.rb +11 -12
- data/rakefile +1 -1
- metadata +21 -20
data/CHANGES
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
0.3.2
|
2
|
+
- make RDig compatible with Ferret 0.10.x
|
3
|
+
- won't work any more with Ferret 0.9.x and before
|
4
|
+
|
5
|
+
0.3.1
|
6
|
+
- Bug fix release: fixed handling of unparseable URLs
|
7
|
+
|
1
8
|
0.3.0
|
2
9
|
- file system crawling
|
3
10
|
- optional url rewriting before indexing, e.g. for linking to results
|
data/README
CHANGED
@@ -5,7 +5,7 @@ to help building a site search for web sites or intranets. Internally,
|
|
5
5
|
Ferret is used for the full text indexing. After creating a config file
|
6
6
|
for your site, the index can be built with a single call to rdig.
|
7
7
|
|
8
|
-
RDig depends on Ferret (>= 0.
|
8
|
+
RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).
|
9
9
|
|
10
10
|
== basic usage
|
11
11
|
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.1'
|
27
|
+
RDIGVERSION = '0.3.2'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -142,7 +142,7 @@ module RDig
|
|
142
142
|
:create => true,
|
143
143
|
:handle_parse_errors => true,
|
144
144
|
:analyzer => Ferret::Analysis::StandardAnalyzer.new,
|
145
|
-
:occur_default =>
|
145
|
+
:occur_default => :must
|
146
146
|
)
|
147
147
|
)
|
148
148
|
end
|
data/lib/rdig/index.rb
CHANGED
@@ -3,27 +3,26 @@ module RDig
|
|
3
3
|
|
4
4
|
# used by the crawler to build the ferret index
|
5
5
|
class Indexer
|
6
|
-
include MonitorMixin
|
6
|
+
include MonitorMixin
|
7
7
|
|
8
8
|
def initialize(settings)
|
9
9
|
@config = settings
|
10
|
-
@index_writer = IndexWriter.new(
|
11
|
-
|
12
|
-
|
10
|
+
@index_writer = Ferret::Index::IndexWriter.new(
|
11
|
+
:path => settings.path,
|
12
|
+
:create => settings.create,
|
13
|
+
:analyzer => settings.analyzer)
|
13
14
|
super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
|
14
15
|
end
|
15
16
|
|
16
17
|
def add_to_index(document)
|
17
18
|
puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
|
18
|
-
doc = Ferret::Document::Document.new
|
19
19
|
@config.rewrite_uri.call(document.uri) if @config.rewrite_uri
|
20
|
-
|
21
|
-
doc
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
20
|
+
# all stored and tokenized, should be ferret defaults
|
21
|
+
doc = {
|
22
|
+
:url => document.uri.to_s,
|
23
|
+
:title => document.title,
|
24
|
+
:data => document.body
|
25
|
+
}
|
27
26
|
synchronize do
|
28
27
|
@index_writer << doc
|
29
28
|
end
|
data/rakefile
CHANGED
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.9.0
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.1
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.2
|
7
|
+
date: 2006-10-09 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,49 +25,50 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
+
post_install_message:
|
28
29
|
authors:
|
29
30
|
- Jens Kraemer
|
30
31
|
files:
|
31
32
|
- bin/rdig
|
32
33
|
- lib/rdig
|
33
|
-
- lib/rdig.rb
|
34
34
|
- lib/htmlentities
|
35
|
-
- lib/rdig
|
36
|
-
- lib/rdig/
|
37
|
-
- lib/rdig/content_extractors.rb
|
35
|
+
- lib/rdig.rb
|
36
|
+
- lib/rdig/crawler.rb
|
38
37
|
- lib/rdig/search.rb
|
39
38
|
- lib/rdig/highlight.rb
|
40
39
|
- lib/rdig/index.rb
|
41
40
|
- lib/rdig/url_filters.rb
|
42
|
-
- lib/rdig/
|
43
|
-
- lib/
|
44
|
-
- lib/
|
41
|
+
- lib/rdig/content_extractors.rb
|
42
|
+
- lib/rdig/documents.rb
|
43
|
+
- lib/rdig/file.rb
|
45
44
|
- lib/htmlentities/CHANGES
|
46
45
|
- lib/htmlentities/COPYING
|
46
|
+
- lib/htmlentities/README
|
47
|
+
- lib/htmlentities/htmlentities.rb
|
47
48
|
- test/unit
|
48
49
|
- test/fixtures
|
49
50
|
- test/test_helper.rb
|
50
|
-
- test/unit/html_content_extractor_test.rb
|
51
|
-
- test/unit/url_filters_test.rb
|
52
|
-
- test/unit/word_content_extractor_test.rb
|
53
|
-
- test/unit/crawler_fs_test.rb
|
54
51
|
- test/unit/etag_filter_test.rb
|
52
|
+
- test/unit/url_filters_test.rb
|
53
|
+
- test/unit/html_content_extractor_test.rb
|
55
54
|
- test/unit/pdf_content_extractor_test.rb
|
55
|
+
- test/unit/word_content_extractor_test.rb
|
56
56
|
- test/unit/file_document_test.rb
|
57
|
-
- test/
|
57
|
+
- test/unit/crawler_fs_test.rb
|
58
58
|
- test/fixtures/html
|
59
|
+
- test/fixtures/pdf
|
59
60
|
- test/fixtures/word
|
60
|
-
- test/fixtures/pdf/simple.pdf
|
61
61
|
- test/fixtures/html/entities.html
|
62
|
-
- test/fixtures/html/custom_tag_selectors.html
|
63
62
|
- test/fixtures/html/simple.html
|
63
|
+
- test/fixtures/html/custom_tag_selectors.html
|
64
|
+
- test/fixtures/pdf/simple.pdf
|
64
65
|
- test/fixtures/word/simple.doc
|
65
66
|
- doc/examples
|
66
67
|
- doc/examples/config.rb
|
67
|
-
- TODO
|
68
68
|
- LICENSE
|
69
|
-
-
|
69
|
+
- TODO
|
70
70
|
- CHANGES
|
71
|
+
- README
|
71
72
|
- install.rb
|
72
73
|
- rakefile
|
73
74
|
test_files: []
|
@@ -97,7 +98,7 @@ dependencies:
|
|
97
98
|
requirements:
|
98
99
|
- - ">="
|
99
100
|
- !ruby/object:Gem::Version
|
100
|
-
version: 0.
|
101
|
+
version: 0.10.0
|
101
102
|
version:
|
102
103
|
- !ruby/object:Gem::Dependency
|
103
104
|
name: rubyful_soup
|