rdig 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -5,7 +5,11 @@ to help building a site search for web sites or intranets. Internally,
5
5
  Ferret is used for the full text indexing. After creating a config file
6
6
  for your site, the index can be built with a single call to rdig.
7
7
 
8
- RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).
8
+ RDig depends on Ferret (>= 0.10.0) and, for parsing HTML, on either
9
+ Hpricot (>= 0.4) or the RubyfulSoup library (>= 1.0.4). As I know no way
10
+ to specify such an OR dependency in a gem specification, the gem depends
11
+ on Hpricot. If this is a problem for you, install the gem with --force and
12
+ manually do a +gem install rubyful_soup+.
9
13
 
10
14
  == basic usage
11
15
 
@@ -49,28 +49,21 @@ RDig.configuration do |cfg|
49
49
  # hpricot config above, and uncomment the following:
50
50
  #
51
51
  # :rubyful_soup => OpenStruct.new(
52
- # # select the html element that contains the content to index
53
- # # by default, we index all inside the body tag:
52
+ # # provide a method that selects the tag containing the page content you
53
+ # # want to index. Useful to avoid indexing common elements like navigation
54
+ # # and page footers for every page.
54
55
  # :content_tag_selector => lambda { |tagsoup|
55
56
  # tagsoup.html.body
56
57
  # },
57
- # # select the html element containing the title
58
+ # # provide a method that returns the title of an html document
59
+ # # this method may either return a tag to extract the title from,
60
+ # # or a ready-to-index string.
58
61
  # :title_tag_selector => lambda { |tagsoup|
59
62
  # tagsoup.html.head.title
60
63
  # }
61
64
  # )
62
65
  )
63
66
 
64
- # provide a method that returns the title of an html document
65
- # this method may either return a tag to extract the title from,
66
- # or a ready-to-index string.
67
- # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
68
-
69
- # provide a method that selects the tag containing the page content you
70
- # want to index. Useful to avoid indexing common elements like navigation
71
- # and page footers for every page.
72
- # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
73
-
74
67
  # crawler options
75
68
 
76
69
  # Notice: for file system crawling the include/exclude_document patterns are
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.3'
27
+ RDIGVERSION = '0.3.4'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -16,15 +16,15 @@ module RDig
16
16
 
17
17
  def self.inherited(extractor)
18
18
  super(extractor)
19
- puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
20
19
  self.extractors << extractor
21
20
  end
22
21
 
23
22
  def self.extractors; @@extractors ||= [] end
24
23
  def self.extractor_instances
25
24
  @@extractor_instances ||= extractors.map { |ex_class|
26
- ex_class.new(RDig.configuration.content_extraction)
27
- }
25
+ puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
26
+ ex_class.new(RDig.configuration.content_extraction) rescue nil
27
+ }.compact
28
28
  end
29
29
 
30
30
  def self.process(content, content_type)
@@ -32,7 +32,6 @@ module RDig
32
32
  return extractor.process(content) if extractor.can_do(content_type)
33
33
  }
34
34
  puts "unable to handle content type #{content_type}"
35
- nil
36
35
  end
37
36
 
38
37
  def initialize(config)
@@ -78,8 +77,8 @@ end
78
77
  # load content extractors
79
78
  Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
80
79
  begin
81
- require f
82
- rescue
83
- puts "error loading #{f}: #{$!}"
80
+ require f
81
+ rescue LoadError
82
+ puts "could not load #{f}: #{$!}"
84
83
  end
85
84
  end
@@ -13,16 +13,25 @@ module RDig
13
13
  @wvhtml = 'wvHtml'
14
14
  @pattern = /^application\/msword/
15
15
  # html extractor for parsing wvHtml output
16
- @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
17
- :rubyful_soup => OpenStruct.new(
18
- :content_tag_selector => lambda { |tagsoup|
19
- tagsoup.html.body
20
- },
21
- :title_tag_selector => lambda { |tagsoup|
22
- tagsoup.html.head.title
23
- }
24
- )))
25
-
16
+ if defined?(HpricotContentExtractor)
17
+ @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
18
+ :hpricot => OpenStruct.new(
19
+ :content_tag_selector => 'body',
20
+ :title_tag_selector => 'title'
21
+ )))
22
+ elsif defined?(RubyfulSoupContentExtractor)
23
+ @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
24
+ :rubyful_soup => OpenStruct.new(
25
+ :content_tag_selector => lambda { |tagsoup|
26
+ tagsoup.html.body
27
+ },
28
+ :title_tag_selector => lambda { |tagsoup|
29
+ tagsoup.html.head.title
30
+ }
31
+ )))
32
+ else
33
+ raise "need at least one html content extractor - please install hpricot or rubyful_soup"
34
+ end
26
35
  # TODO: better: if $?.exitstatus == 127 (not found)
27
36
  @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
28
37
  end
@@ -5,6 +5,7 @@ rescue LoadError
5
5
  require 'hpricot'
6
6
  end
7
7
 
8
+ if defined?(Hpricot)
8
9
  module RDig
9
10
  module ContentExtractors
10
11
 
@@ -24,7 +25,7 @@ module RDig
24
25
  def process(content)
25
26
  doc = Hpricot(content)
26
27
  {
27
- :title => extract_title(doc).decode_entities,
28
+ :title => extract_title(doc).decode_entities.strip,
28
29
  :links => extract_links(doc),
29
30
  :content => extract_content(doc).decode_entities
30
31
  }
@@ -97,3 +98,4 @@ module RDig
97
98
 
98
99
  end
99
100
  end
101
+ end
@@ -2,8 +2,10 @@ begin
2
2
  require 'rubyful_soup'
3
3
  rescue LoadError
4
4
  require 'rubygems'
5
- require 'rubyful_soup'
5
+ require 'rubyful_soup' rescue nil
6
6
  end
7
+
8
+ if defined?(BeautifulSoup)
7
9
 
8
10
  # override some methods concerned with entity resolving
9
11
  # to convert them to strings
@@ -145,3 +147,5 @@ module RDig
145
147
 
146
148
  end
147
149
  end
150
+
151
+ end
data/rakefile CHANGED
@@ -134,7 +134,7 @@ else
134
134
  # TODO: check if there is anything like 'suggested' instead of required, or
135
135
  # ORed dependencies...
136
136
  #s.add_dependency('rubyful_soup', '>= 1.0.4')
137
- #s.add_dependency('hpricot', '>= 0.4')
137
+ s.add_dependency('hpricot', '>= 0.4')
138
138
  #s.requirements << ""
139
139
 
140
140
  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.0
2
+ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.3
7
- date: 2006-10-23 00:00:00 +02:00
6
+ version: 0.3.4
7
+ date: 2006-12-31 00:00:00 +01:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
10
  - lib
@@ -25,7 +25,6 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
25
25
  platform: ruby
26
26
  signing_key:
27
27
  cert_chain:
28
- post_install_message:
29
28
  authors:
30
29
  - Jens Kraemer
31
30
  files:
@@ -107,3 +106,12 @@ dependencies:
107
106
  - !ruby/object:Gem::Version
108
107
  version: 0.10.0
109
108
  version:
109
+ - !ruby/object:Gem::Dependency
110
+ name: hpricot
111
+ version_requirement:
112
+ version_requirements: !ruby/object:Gem::Version::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: "0.4"
117
+ version: