rdig 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +5 -1
- data/doc/examples/config.rb +6 -13
- data/lib/rdig.rb +1 -1
- data/lib/rdig/content_extractors.rb +6 -7
- data/lib/rdig/content_extractors/doc.rb +19 -10
- data/lib/rdig/content_extractors/hpricot.rb +3 -1
- data/lib/rdig/content_extractors/rubyful_soup.rb +5 -1
- data/rakefile +1 -1
- metadata +12 -4
data/README
CHANGED
@@ -5,7 +5,11 @@ to help building a site search for web sites or intranets. Internally,
|
|
5
5
|
Ferret is used for the full text indexing. After creating a config file
|
6
6
|
for your site, the index can be built with a single call to rdig.
|
7
7
|
|
8
|
-
RDig depends on Ferret (>= 0.10.0) and
|
8
|
+
RDig depends on Ferret (>= 0.10.0) and, for parsing HTML, on either
|
9
|
+
Hpricot (>= 0.4) or the RubyfulSoup library (>= 1.0.4). As I know of no way
|
10
|
+
to specify such an OR dependency in a gem specification, the gem depends
|
11
|
+
on Hpricot. If this is a problem for you, install the gem with --force and
|
12
|
+
manually do a +gem install rubyful_soup+.
|
9
13
|
|
10
14
|
== basic usage
|
11
15
|
|
data/doc/examples/config.rb
CHANGED
@@ -49,28 +49,21 @@ RDig.configuration do |cfg|
|
|
49
49
|
# hpricot config above, and uncomment the following:
|
50
50
|
#
|
51
51
|
# :rubyful_soup => OpenStruct.new(
|
52
|
-
# #
|
53
|
-
# #
|
52
|
+
# # provide a method that returns the title of an html document
|
53
|
+
# # this method may either return a tag to extract the title from,
|
54
|
+
# # or a ready-to-index string.
|
54
55
|
# :content_tag_selector => lambda { |tagsoup|
|
55
56
|
# tagsoup.html.body
|
56
57
|
# },
|
57
|
-
# #
|
58
|
+
# # provide a method that selects the tag containing the page content you
|
59
|
+
# # want to index. Useful to avoid indexing common elements like navigation
|
60
|
+
# # and page footers for every page.
|
58
61
|
# :title_tag_selector => lambda { |tagsoup|
|
59
62
|
# tagsoup.html.head.title
|
60
63
|
# }
|
61
64
|
# )
|
62
65
|
)
|
63
66
|
|
64
|
-
# provide a method that returns the title of an html document
|
65
|
-
# this method may either return a tag to extract the title from,
|
66
|
-
# or a ready-to-index string.
|
67
|
-
# cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
|
68
|
-
|
69
|
-
# provide a method that selects the tag containing the page content you
|
70
|
-
# want to index. Useful to avoid indexing common elements like navigation
|
71
|
-
# and page footers for every page.
|
72
|
-
# cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
|
73
|
-
|
74
67
|
# crawler options
|
75
68
|
|
76
69
|
# Notice: for file system crawling the include/exclude_document patterns are
|
data/lib/rdig.rb
CHANGED
@@ -16,15 +16,15 @@ module RDig
|
|
16
16
|
|
17
17
|
def self.inherited(extractor)
|
18
18
|
super(extractor)
|
19
|
-
puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
|
20
19
|
self.extractors << extractor
|
21
20
|
end
|
22
21
|
|
23
22
|
def self.extractors; @@extractors ||= [] end
|
24
23
|
def self.extractor_instances
|
25
24
|
@@extractor_instances ||= extractors.map { |ex_class|
|
26
|
-
ex_class
|
27
|
-
|
25
|
+
puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
|
26
|
+
ex_class.new(RDig.configuration.content_extraction) rescue nil
|
27
|
+
}.compact
|
28
28
|
end
|
29
29
|
|
30
30
|
def self.process(content, content_type)
|
@@ -32,7 +32,6 @@ module RDig
|
|
32
32
|
return extractor.process(content) if extractor.can_do(content_type)
|
33
33
|
}
|
34
34
|
puts "unable to handle content type #{content_type}"
|
35
|
-
nil
|
36
35
|
end
|
37
36
|
|
38
37
|
def initialize(config)
|
@@ -78,8 +77,8 @@ end
|
|
78
77
|
# load content extractors
|
79
78
|
Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
|
80
79
|
begin
|
81
|
-
require f
|
82
|
-
rescue
|
83
|
-
puts "
|
80
|
+
require f
|
81
|
+
rescue LoadError
|
82
|
+
puts "could not load #{f}: #{$!}"
|
84
83
|
end
|
85
84
|
end
|
@@ -13,16 +13,25 @@ module RDig
|
|
13
13
|
@wvhtml = 'wvHtml'
|
14
14
|
@pattern = /^application\/msword/
|
15
15
|
# html extractor for parsing wvHtml output
|
16
|
-
|
17
|
-
|
18
|
-
:
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
16
|
+
if defined?(HpricotContentExtractor)
|
17
|
+
@html_extractor = HpricotContentExtractor.new(OpenStruct.new(
|
18
|
+
:hpricot => OpenStruct.new(
|
19
|
+
:content_tag_selector => 'body',
|
20
|
+
:title_tag_selector => 'title'
|
21
|
+
)))
|
22
|
+
elsif defined?(RubyfulSoupContentExtractor)
|
23
|
+
@html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
|
24
|
+
:rubyful_soup => OpenStruct.new(
|
25
|
+
:content_tag_selector => lambda { |tagsoup|
|
26
|
+
tagsoup.html.body
|
27
|
+
},
|
28
|
+
:title_tag_selector => lambda { |tagsoup|
|
29
|
+
tagsoup.html.head.title
|
30
|
+
}
|
31
|
+
)))
|
32
|
+
else
|
33
|
+
raise "need at least one html content extractor - please install hpricot or rubyful_soup"
|
34
|
+
end
|
26
35
|
# TODO: better: if $?.exitstatus == 127 (not found)
|
27
36
|
@available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
|
28
37
|
end
|
@@ -5,6 +5,7 @@ rescue LoadError
|
|
5
5
|
require 'hpricot'
|
6
6
|
end
|
7
7
|
|
8
|
+
if defined?(Hpricot)
|
8
9
|
module RDig
|
9
10
|
module ContentExtractors
|
10
11
|
|
@@ -24,7 +25,7 @@ module RDig
|
|
24
25
|
def process(content)
|
25
26
|
doc = Hpricot(content)
|
26
27
|
{
|
27
|
-
:title => extract_title(doc).decode_entities,
|
28
|
+
:title => extract_title(doc).decode_entities.strip,
|
28
29
|
:links => extract_links(doc),
|
29
30
|
:content => extract_content(doc).decode_entities
|
30
31
|
}
|
@@ -97,3 +98,4 @@ module RDig
|
|
97
98
|
|
98
99
|
end
|
99
100
|
end
|
101
|
+
end
|
@@ -2,8 +2,10 @@ begin
|
|
2
2
|
require 'rubyful_soup'
|
3
3
|
rescue LoadError
|
4
4
|
require 'rubygems'
|
5
|
-
require 'rubyful_soup'
|
5
|
+
require 'rubyful_soup' rescue nil
|
6
6
|
end
|
7
|
+
|
8
|
+
if defined?(BeautifulSoup)
|
7
9
|
|
8
10
|
# override some methods concerned with entity resolving
|
9
11
|
# to convert them to strings
|
@@ -145,3 +147,5 @@ module RDig
|
|
145
147
|
|
146
148
|
end
|
147
149
|
end
|
150
|
+
|
151
|
+
end
|
data/rakefile
CHANGED
@@ -134,7 +134,7 @@ else
|
|
134
134
|
# TODO: check if there is anything like 'suggested' instead of required, or
|
135
135
|
# ORed dependencies...
|
136
136
|
#s.add_dependency('rubyful_soup', '>= 1.0.4')
|
137
|
-
|
137
|
+
s.add_dependency('hpricot', '>= 0.4')
|
138
138
|
#s.requirements << ""
|
139
139
|
|
140
140
|
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.3
|
7
|
-
date: 2006-
|
6
|
+
version: 0.3.4
|
7
|
+
date: 2006-12-31 00:00:00 +01:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
@@ -25,7 +25,6 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
25
25
|
platform: ruby
|
26
26
|
signing_key:
|
27
27
|
cert_chain:
|
28
|
-
post_install_message:
|
29
28
|
authors:
|
30
29
|
- Jens Kraemer
|
31
30
|
files:
|
@@ -107,3 +106,12 @@ dependencies:
|
|
107
106
|
- !ruby/object:Gem::Version
|
108
107
|
version: 0.10.0
|
109
108
|
version:
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: hpricot
|
111
|
+
version_requirement:
|
112
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: "0.4"
|
117
|
+
version:
|