rdig 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -5,7 +5,11 @@ to help building a site search for web sites or intranets. Internally,
5
5
  Ferret is used for the full text indexing. After creating a config file
6
6
  for your site, the index can be built with a single call to rdig.
7
7
 
8
- RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).
8
+ RDig depends on Ferret (>= 0.10.0) and, for parsing HTML, on either
9
+ Hpricot (>= 0.4) or the RubyfulSoup library (>= 1.0.4). As I know of no way
10
+ to specify such an OR dependency in a gem specification, the gem depends
11
+ on Hpricot. If this is a problem for you, install the gem with --force and
12
+ manually do a +gem install rubyful_soup+.
9
13
 
10
14
  == basic usage
11
15
 
@@ -49,28 +49,21 @@ RDig.configuration do |cfg|
49
49
  # hpricot config above, and uncomment the following:
50
50
  #
51
51
  # :rubyful_soup => OpenStruct.new(
52
- # # select the html element that contains the content to index
53
- # # by default, we index all inside the body tag:
52
+ # # provide a method that selects the tag containing the page content you
53
+ # # want to index. Useful to avoid indexing common elements like navigation
54
+ # # and page footers for every page.
54
55
  # :content_tag_selector => lambda { |tagsoup|
55
56
  # tagsoup.html.body
56
57
  # },
57
- # # select the html element containing the title
58
+ # # provide a method that returns the title of an html document
59
+ # # this method may either return a tag to extract the title from,
60
+ # # or a ready-to-index string.
58
61
  # :title_tag_selector => lambda { |tagsoup|
59
62
  # tagsoup.html.head.title
60
63
  # }
61
64
  # )
62
65
  )
63
66
 
64
- # provide a method that returns the title of an html document
65
- # this method may either return a tag to extract the title from,
66
- # or a ready-to-index string.
67
- # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
68
-
69
- # provide a method that selects the tag containing the page content you
70
- # want to index. Useful to avoid indexing common elements like navigation
71
- # and page footers for every page.
72
- # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
73
-
74
67
  # crawler options
75
68
 
76
69
  # Notice: for file system crawling the include/exclude_document patterns are
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.3'
27
+ RDIGVERSION = '0.3.4'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -16,15 +16,15 @@ module RDig
16
16
 
17
17
  def self.inherited(extractor)
18
18
  super(extractor)
19
- puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
20
19
  self.extractors << extractor
21
20
  end
22
21
 
23
22
  def self.extractors; @@extractors ||= [] end
24
23
  def self.extractor_instances
25
24
  @@extractor_instances ||= extractors.map { |ex_class|
26
- ex_class.new(RDig.configuration.content_extraction)
27
- }
25
+ puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
26
+ ex_class.new(RDig.configuration.content_extraction) rescue nil
27
+ }.compact
28
28
  end
29
29
 
30
30
  def self.process(content, content_type)
@@ -32,7 +32,6 @@ module RDig
32
32
  return extractor.process(content) if extractor.can_do(content_type)
33
33
  }
34
34
  puts "unable to handle content type #{content_type}"
35
- nil
36
35
  end
37
36
 
38
37
  def initialize(config)
@@ -78,8 +77,8 @@ end
78
77
  # load content extractors
79
78
  Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
80
79
  begin
81
- require f
82
- rescue
83
- puts "error loading #{f}: #{$!}"
80
+ require f
81
+ rescue LoadError
82
+ puts "could not load #{f}: #{$!}"
84
83
  end
85
84
  end
@@ -13,16 +13,25 @@ module RDig
13
13
  @wvhtml = 'wvHtml'
14
14
  @pattern = /^application\/msword/
15
15
  # html extractor for parsing wvHtml output
16
- @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
17
- :rubyful_soup => OpenStruct.new(
18
- :content_tag_selector => lambda { |tagsoup|
19
- tagsoup.html.body
20
- },
21
- :title_tag_selector => lambda { |tagsoup|
22
- tagsoup.html.head.title
23
- }
24
- )))
25
-
16
+ if defined?(HpricotContentExtractor)
17
+ @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
18
+ :hpricot => OpenStruct.new(
19
+ :content_tag_selector => 'body',
20
+ :title_tag_selector => 'title'
21
+ )))
22
+ elsif defined?(RubyfulSoupContentExtractor)
23
+ @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
24
+ :rubyful_soup => OpenStruct.new(
25
+ :content_tag_selector => lambda { |tagsoup|
26
+ tagsoup.html.body
27
+ },
28
+ :title_tag_selector => lambda { |tagsoup|
29
+ tagsoup.html.head.title
30
+ }
31
+ )))
32
+ else
33
+ raise "need at least one html content extractor - please install hpricot or rubyful_soup"
34
+ end
26
35
  # TODO: better: if $?.exitstatus == 127 (not found)
27
36
  @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
28
37
  end
@@ -5,6 +5,7 @@ rescue LoadError
5
5
  require 'hpricot'
6
6
  end
7
7
 
8
+ if defined?(Hpricot)
8
9
  module RDig
9
10
  module ContentExtractors
10
11
 
@@ -24,7 +25,7 @@ module RDig
24
25
  def process(content)
25
26
  doc = Hpricot(content)
26
27
  {
27
- :title => extract_title(doc).decode_entities,
28
+ :title => extract_title(doc).decode_entities.strip,
28
29
  :links => extract_links(doc),
29
30
  :content => extract_content(doc).decode_entities
30
31
  }
@@ -97,3 +98,4 @@ module RDig
97
98
 
98
99
  end
99
100
  end
101
+ end
@@ -2,8 +2,10 @@ begin
2
2
  require 'rubyful_soup'
3
3
  rescue LoadError
4
4
  require 'rubygems'
5
- require 'rubyful_soup'
5
+ require 'rubyful_soup' rescue nil
6
6
  end
7
+
8
+ if defined?(BeautifulSoup)
7
9
 
8
10
  # override some methods concerned with entity resolving
9
11
  # to convert them to strings
@@ -145,3 +147,5 @@ module RDig
145
147
 
146
148
  end
147
149
  end
150
+
151
+ end
data/rakefile CHANGED
@@ -134,7 +134,7 @@ else
134
134
  # TODO: check if there is anything like 'suggested' instead of required, or
135
135
  # ORed dependencies...
136
136
  #s.add_dependency('rubyful_soup', '>= 1.0.4')
137
- #s.add_dependency('hpricot', '>= 0.4')
137
+ s.add_dependency('hpricot', '>= 0.4')
138
138
  #s.requirements << ""
139
139
 
140
140
  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
2
+ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.3
7
- date: 2006-10-23 00:00:00 +02:00
6
+ version: 0.3.4
7
+ date: 2006-12-31 00:00:00 +01:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
@@ -25,7 +25,6 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
- post_install_message:
29
28
  authors:
30
29
  - Jens Kraemer
31
30
  files:
@@ -107,3 +106,12 @@ dependencies:
107
106
  - !ruby/object:Gem::Version
108
107
  version: 0.10.0
109
108
  version:
109
+ - !ruby/object:Gem::Dependency
110
+ name: hpricot
111
+ version_requirement:
112
+ version_requirements: !ruby/object:Gem::Version::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: "0.4"
117
+ version: