rdig 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +5 -1
- data/doc/examples/config.rb +6 -13
- data/lib/rdig.rb +1 -1
- data/lib/rdig/content_extractors.rb +6 -7
- data/lib/rdig/content_extractors/doc.rb +19 -10
- data/lib/rdig/content_extractors/hpricot.rb +3 -1
- data/lib/rdig/content_extractors/rubyful_soup.rb +5 -1
- data/rakefile +1 -1
- metadata +12 -4
data/README
CHANGED
@@ -5,7 +5,11 @@ to help building a site search for web sites or intranets. Internally,
|
|
5
5
|
Ferret is used for the full text indexing. After creating a config file
|
6
6
|
for your site, the index can be built with a single call to rdig.
|
7
7
|
|
8
|
-
RDig depends on Ferret (>= 0.10.0) and
|
8
|
+
RDig depends on Ferret (>= 0.10.0) and, for parsing HTML, on either
|
9
|
+
Hpricot (>= 0.4) or the RubyfulSoup library (>= 1.0.4). As I know no way
|
10
|
+
to specify such an OR dependency in a gem specification, the gem depends
|
11
|
+
on Hpricot. If this is a problem for you, install the gem with --force and
|
12
|
+
manually do a +gem install rubyful_soup+.
|
9
13
|
|
10
14
|
== basic usage
|
11
15
|
|
data/doc/examples/config.rb
CHANGED
@@ -49,28 +49,21 @@ RDig.configuration do |cfg|
|
|
49
49
|
# hpricot config above, and uncomment the following:
|
50
50
|
#
|
51
51
|
# :rubyful_soup => OpenStruct.new(
|
52
|
-
# #
|
53
|
-
# #
|
52
|
+
# # provide a method that returns the title of an html document
|
53
|
+
# # this method may either return a tag to extract the title from,
|
54
|
+
# # or a ready-to-index string.
|
54
55
|
# :content_tag_selector => lambda { |tagsoup|
|
55
56
|
# tagsoup.html.body
|
56
57
|
# },
|
57
|
-
# #
|
58
|
+
# # provide a method that selects the tag containing the page content you
|
59
|
+
# # want to index. Useful to avoid indexing common elements like navigation
|
60
|
+
# # and page footers for every page.
|
58
61
|
# :title_tag_selector => lambda { |tagsoup|
|
59
62
|
# tagsoup.html.head.title
|
60
63
|
# }
|
61
64
|
# )
|
62
65
|
)
|
63
66
|
|
64
|
-
# provide a method that returns the title of an html document
|
65
|
-
# this method may either return a tag to extract the title from,
|
66
|
-
# or a ready-to-index string.
|
67
|
-
# cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
|
68
|
-
|
69
|
-
# provide a method that selects the tag containing the page content you
|
70
|
-
# want to index. Useful to avoid indexing common elements like navigation
|
71
|
-
# and page footers for every page.
|
72
|
-
# cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
|
73
|
-
|
74
67
|
# crawler options
|
75
68
|
|
76
69
|
# Notice: for file system crawling the include/exclude_document patterns are
|
data/lib/rdig.rb
CHANGED
data/lib/rdig/content_extractors.rb
CHANGED
@@ -16,15 +16,15 @@ module RDig
|
|
16
16
|
|
17
17
|
def self.inherited(extractor)
|
18
18
|
super(extractor)
|
19
|
-
puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
|
20
19
|
self.extractors << extractor
|
21
20
|
end
|
22
21
|
|
23
22
|
def self.extractors; @@extractors ||= [] end
|
24
23
|
def self.extractor_instances
|
25
24
|
@@extractor_instances ||= extractors.map { |ex_class|
|
26
|
-
ex_class
|
27
|
-
|
25
|
+
puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
|
26
|
+
ex_class.new(RDig.configuration.content_extraction) rescue nil
|
27
|
+
}.compact
|
28
28
|
end
|
29
29
|
|
30
30
|
def self.process(content, content_type)
|
@@ -32,7 +32,6 @@ module RDig
|
|
32
32
|
return extractor.process(content) if extractor.can_do(content_type)
|
33
33
|
}
|
34
34
|
puts "unable to handle content type #{content_type}"
|
35
|
-
nil
|
36
35
|
end
|
37
36
|
|
38
37
|
def initialize(config)
|
@@ -78,8 +77,8 @@ end
|
|
78
77
|
# load content extractors
|
79
78
|
Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
|
80
79
|
begin
|
81
|
-
require f
|
82
|
-
rescue
|
83
|
-
puts "
|
80
|
+
require f
|
81
|
+
rescue LoadError
|
82
|
+
puts "could not load #{f}: #{$!}"
|
84
83
|
end
|
85
84
|
end
|
data/lib/rdig/content_extractors/doc.rb
CHANGED
@@ -13,16 +13,25 @@ module RDig
|
|
13
13
|
@wvhtml = 'wvHtml'
|
14
14
|
@pattern = /^application\/msword/
|
15
15
|
# html extractor for parsing wvHtml output
|
16
|
-
|
17
|
-
|
18
|
-
:
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
if defined?(HpricotContentExtractor)
|
17
|
+
@html_extractor = HpricotContentExtractor.new(OpenStruct.new(
|
18
|
+
:hpricot => OpenStruct.new(
|
19
|
+
:content_tag_selector => 'body',
|
20
|
+
:title_tag_selector => 'title'
|
21
|
+
)))
|
22
|
+
elsif defined?(RubyfulSoupContentExtractor)
|
23
|
+
@html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
|
24
|
+
:rubyful_soup => OpenStruct.new(
|
25
|
+
:content_tag_selector => lambda { |tagsoup|
|
26
|
+
tagsoup.html.body
|
27
|
+
},
|
28
|
+
:title_tag_selector => lambda { |tagsoup|
|
29
|
+
tagsoup.html.head.title
|
30
|
+
}
|
31
|
+
)))
|
32
|
+
else
|
33
|
+
raise "need at least one html content extractor - please install hpricot or rubyful_soup"
|
34
|
+
end
|
26
35
|
# TODO: better: if $?.exitstatus == 127 (not found)
|
27
36
|
@available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
|
28
37
|
end
|
data/lib/rdig/content_extractors/hpricot.rb
CHANGED
@@ -5,6 +5,7 @@ rescue LoadError
|
|
5
5
|
require 'hpricot'
|
6
6
|
end
|
7
7
|
|
8
|
+
if defined?(Hpricot)
|
8
9
|
module RDig
|
9
10
|
module ContentExtractors
|
10
11
|
|
@@ -24,7 +25,7 @@ module RDig
|
|
24
25
|
def process(content)
|
25
26
|
doc = Hpricot(content)
|
26
27
|
{
|
27
|
-
:title => extract_title(doc).decode_entities,
|
28
|
+
:title => extract_title(doc).decode_entities.strip,
|
28
29
|
:links => extract_links(doc),
|
29
30
|
:content => extract_content(doc).decode_entities
|
30
31
|
}
|
@@ -97,3 +98,4 @@ module RDig
|
|
97
98
|
|
98
99
|
end
|
99
100
|
end
|
101
|
+
end
|
data/lib/rdig/content_extractors/rubyful_soup.rb
CHANGED
@@ -2,8 +2,10 @@ begin
|
|
2
2
|
require 'rubyful_soup'
|
3
3
|
rescue LoadError
|
4
4
|
require 'rubygems'
|
5
|
-
require 'rubyful_soup'
|
5
|
+
require 'rubyful_soup' rescue nil
|
6
6
|
end
|
7
|
+
|
8
|
+
if defined?(BeautifulSoup)
|
7
9
|
|
8
10
|
# override some methods concerned with entity resolving
|
9
11
|
# to convert them to strings
|
@@ -145,3 +147,5 @@ module RDig
|
|
145
147
|
|
146
148
|
end
|
147
149
|
end
|
150
|
+
|
151
|
+
end
|
data/rakefile
CHANGED
@@ -134,7 +134,7 @@ else
|
|
134
134
|
# TODO: check if there is anything like 'suggested' instead of required, or
|
135
135
|
# ORed dependencies...
|
136
136
|
#s.add_dependency('rubyful_soup', '>= 1.0.4')
|
137
|
-
|
137
|
+
s.add_dependency('hpricot', '>= 0.4')
|
138
138
|
#s.requirements << ""
|
139
139
|
|
140
140
|
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.3
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.4
|
7
|
+
date: 2006-12-31 00:00:00 +01:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,7 +25,6 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
-
post_install_message:
|
29
28
|
authors:
|
30
29
|
- Jens Kraemer
|
31
30
|
files:
|
@@ -107,3 +106,12 @@ dependencies:
|
|
107
106
|
- !ruby/object:Gem::Version
|
108
107
|
version: 0.10.0
|
109
108
|
version:
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: hpricot
|
111
|
+
version_requirement:
|
112
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: "0.4"
|
117
|
+
version:
|