rdig 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,2 +1,8 @@
1
+ 0.2.0
2
+ - add pdf and Word content extraction capabilities using the tools
3
+ from the xpdf-utils and wv packages
4
+ - additional content extractors may be plugged in by extending
5
+ the ContentExtractor class
6
+
1
7
  0.1.0
2
8
  initial release
data/bin/rdig CHANGED
@@ -13,20 +13,3 @@ end
13
13
  RDig.application.run
14
14
 
15
15
 
16
- #$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
17
- #$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
18
- #require 'init'
19
-
20
- #if ARGV[0]
21
- # require ARGV[0]
22
- #else
23
- # require 'config'
24
- #end
25
-
26
- #include SiteSearch
27
-
28
-
29
- #puts "creating new index in #{SiteSearch.settings[:index_dir]}"
30
-
31
- #crawler = Crawler.new
32
- #crawler.run
data/lib/rdig.rb CHANGED
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.1.0'
27
+ RDIGVERSION = '0.2.0'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -37,6 +37,10 @@ require 'cgi'
37
37
  require 'set'
38
38
  require 'net/http'
39
39
  require 'getoptlong'
40
+ require 'tempfile'
41
+ # mkmf gives us the handy find_executable method used to check for helper
42
+ # programs:
43
+ require 'mkmf'
40
44
 
41
45
  begin
42
46
  require 'rubyful_soup'
@@ -23,31 +23,146 @@ end
23
23
 
24
24
  module RDig
25
25
 
26
- # Contains Classes which are used for extracting content and meta data from
26
+ # Contains classes which are used for extracting content and meta data from
27
27
  # various content types.
28
- #
29
- # TODO: support at least pdf, too.
30
28
  module ContentExtractors
31
29
 
32
30
  # process the given +content+ depending on its +content_type+.
33
- def ContentExtractors.process(content, content_type)
34
- case content_type
35
- when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
36
- return HtmlContentExtractor.process(content)
37
- else
31
+ def self.process(content, content_type)
32
+ ContentExtractor.process(content, content_type)
33
+ # case content_type
34
+ #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
35
+ # return HtmlContentExtractor.process(content)
36
+ #when /^application\/.+pdf/
37
+ # return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
38
+ #else
39
+ # puts "unable to handle content type #{content_type}"
40
+ #end
41
+ #return nil
42
+ end
43
+
44
+ # Base class for Content Extractors.
45
+ # Extractors inheriting from this class will be auto-discovered and used
46
+ # when can_do returns true
47
+ class ContentExtractor
48
+
49
+ def self.inherited(extractor)
50
+ super(extractor)
51
+ puts("discovered content extractor class: #{extractor}")
52
+ self.extractors << extractor
53
+ end
54
+
55
+ def self.extractors; @@extractors ||= [] end
56
+ def self.extractor_instances
57
+ @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
58
+ end
59
+
60
+ def self.process(content, content_type)
61
+ self.extractor_instances.each { |extractor|
62
+ return extractor.process(content) if extractor.can_do(content_type)
63
+ }
38
64
  puts "unable to handle content type #{content_type}"
65
+ nil
66
+ end
67
+
68
+ def can_do(content_type)
69
+ content_type =~ @pattern
39
70
  end
40
- return nil
71
+ end
72
+
73
+
74
+ # To be used by concrete implementations that provide a get_content instance
75
+ # method, which takes a path to a file and returns the textual content
76
+ # extracted from that file.
77
+ module ExternalAppHelper
78
+ def process(content)
79
+ result = {}
80
+ as_file(content) do |file|
81
+ result[:content] = get_content(file.path).strip
82
+ end
83
+ result
84
+ end
85
+
86
+ def as_file(content)
87
+ file = Tempfile.new('rdig')
88
+ file << content
89
+ file.close
90
+ yield file
91
+ file.delete
92
+ end
93
+
94
+ def available
95
+ if @available.nil?
96
+ @available = !find_executable(@executable).nil?
97
+ end
98
+ @available
99
+ end
100
+
101
+ def can_do(content_type)
102
+ available and super(content_type)
103
+ end
104
+ end
105
+
106
+ # Extract text from pdf content.
107
+ #
108
+ # Requires the pdftotext utility from the xpdf-utils package
109
+ # (on debian and friends do 'apt-get install xpdf-utils')
110
+ #
111
+ # TODO: use pdfinfo to get title from document
112
+ class PdfContentExtractor < ContentExtractor
113
+ include ExternalAppHelper
114
+
115
+ def initialize
116
+ @executable = 'pdftotext'
117
+ @pattern = /^application\/pdf/
118
+ end
119
+
120
+ def get_content(path_to_tempfile)
121
+ %x{#{@executable} '#{path_to_tempfile}' -}
122
+ end
123
+ end
124
+
125
+ # Extract text from word documents
126
+ #
127
+ # Requires the wvHtml utility from the wv package
128
+ # (on debian and friends do 'apt-get install wv')
129
+ class WordContentExtractor < ContentExtractor
130
+ include ExternalAppHelper
131
+
132
+ def initialize
133
+ @executable = 'wvHtml'
134
+ @pattern = /^application\/msword/
135
+ @html_extractor = HtmlContentExtractor.new
136
+ end
137
+
138
+ def process(content)
139
+ result = {}
140
+ as_file(content) do |infile|
141
+ outfile = Tempfile.new('rdig')
142
+ outfile.close
143
+ %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
144
+ File.open(outfile.path) do |html|
145
+ result = @html_extractor.process(html.read)
146
+ end
147
+ outfile.delete
148
+ end
149
+ return result || {}
150
+ end
151
+
41
152
  end
42
153
 
43
154
  # extracts title, content and links from html documents
44
- class HtmlContentExtractor
155
+ class HtmlContentExtractor < ContentExtractor
156
+
157
+ def initialize
158
+ @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
159
+ end
45
160
 
46
161
  # returns:
47
162
  # { :content => 'extracted clear text',
48
163
  # :meta => { :title => 'Title' },
49
164
  # :links => [array of urls] }
50
- def self.process(content)
165
+ def process(content)
51
166
  result = { }
52
167
  tag_soup = BeautifulSoup.new(content)
53
168
  result[:title] = extract_title(tag_soup)
@@ -64,7 +179,7 @@ module RDig
64
179
  # - Then, this element is processed by +extract_text+, which will give
65
180
  # all textual content contained in the root element and all its
66
181
  # children.
67
- def self.extract_content(tag_soup)
182
+ def extract_content(tag_soup)
68
183
  content = ''
69
184
  content_element(tag_soup).children { |child|
70
185
  extract_text(child, content)
@@ -74,14 +189,14 @@ module RDig
74
189
 
75
190
  # extracts the href attributes of all a tags, except
76
191
  # internal links like <a href="#top">
77
- def self.extract_links(tagsoup)
192
+ def extract_links(tagsoup)
78
193
  tagsoup.find_all('a').map { |link|
79
194
  CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
80
195
  }.compact
81
196
  end
82
197
 
83
198
  # Extracts the title from the given html tree
84
- def self.extract_title(tagsoup)
199
+ def extract_title(tagsoup)
85
200
  title = ''
86
201
  the_title_tag = title_tag(tagsoup)
87
202
  if the_title_tag.is_a? String
@@ -93,7 +208,7 @@ module RDig
93
208
 
94
209
  # Recursively extracts all text contained in the given element,
95
210
  # and appends it to content.
96
- def self.extract_text(element, content='')
211
+ def extract_text(element, content='')
97
212
  if element.is_a? NavigableString
98
213
  value = strip_comments(element)
99
214
  value.strip!
@@ -118,7 +233,7 @@ module RDig
118
233
  #
119
234
  # This may return a string, e.g. an attribute value selected from a meta
120
235
  # tag, too.
121
- def self.title_tag(tagsoup)
236
+ def title_tag(tagsoup)
122
237
  if RDig.config.content_extraction.html.title_tag_selector
123
238
  RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
124
239
  else
@@ -127,7 +242,7 @@ module RDig
127
242
  end
128
243
 
129
244
  # Retrieve the root element to extract document content from
130
- def self.content_element(tagsoup)
245
+ def content_element(tagsoup)
131
246
  if RDig.config.content_extraction.html.content_tag_selector
132
247
  RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
133
248
  else
@@ -136,7 +251,7 @@ module RDig
136
251
  end
137
252
 
138
253
  # Return the given string minus all html comments
139
- def self.strip_comments(string)
254
+ def strip_comments(string)
140
255
  string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
141
256
  end
142
257
  end
data/rakefile CHANGED
@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
94
94
  # Generate the RDoc documentation ----------------------------------------
95
95
 
96
96
  rd = Rake::RDocTask.new { |rdoc|
97
- rdoc.rdoc_dir = 'doc/html'
97
+ rdoc.rdoc_dir = 'html'
98
98
  rdoc.title = "RDig - Ferret based full text search for web sites"
99
99
  rdoc.options << '--line-numbers' << '--inline-source'
100
100
  rdoc.options << '--main' << 'README'
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
323
323
  end
324
324
  end
325
325
 
326
+ # Publish RDocs ------------------------------------------------------
327
+ desc "Publish the API documentation"
328
+ task :pdoc => [:rdoc] do
329
+ Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
330
+ end
Binary file
Binary file
data/test/test_helper.rb CHANGED
@@ -12,6 +12,12 @@ module TestHelper
12
12
  }
13
13
  end
14
14
 
15
+ def word_doc(name)
16
+ read_fixture("word/#{name}.doc")
17
+ end
18
+ def pdf_doc(name)
19
+ read_fixture("pdf/#{name}.pdf")
20
+ end
15
21
  def html_doc(name)
16
22
  read_fixture("html/#{name}.html")
17
23
  end
@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @extractor = ContentExtractors::HtmlContentExtractor
6
+ @extractor = ContentExtractors::HtmlContentExtractor.new
7
7
  @nbsp = [160].pack('U') # non breaking space
8
8
  @config_backup = RDig.config.content_extraction.html.clone
9
9
  end
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
12
12
  RDig.config.content_extraction.html = @config_backup
13
13
  end
14
14
 
15
+ def test_can_do
16
+ assert !@extractor.can_do('application/pdf')
17
+ assert !@extractor.can_do('application/msword')
18
+ assert @extractor.can_do('text/html')
19
+ assert @extractor.can_do('text/xml')
20
+ assert @extractor.can_do('application/xml')
21
+ assert @extractor.can_do('application/xhtml+xml')
22
+ end
23
+
15
24
  def test_simple
16
- result = @extractor.process(html_doc('simple'))
25
+ result = ContentExtractors.process(html_doc('simple'), 'text/html')
17
26
  assert_not_nil result
18
27
  assert_equal 'Sample Title', result[:title]
19
28
  assert_not_nil result[:content]
@@ -0,0 +1,33 @@
1
+ require 'test_helper'
2
+ class PdfContentExtractorTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @ce = ContentExtractors::PdfContentExtractor.new
7
+ end
8
+
9
+ def test_can_do
10
+ assert @ce.can_do('application/pdf')
11
+ assert !@ce.can_do('application/msword')
12
+ end
13
+ def test_simple_with_ctype
14
+ result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
15
+ check_content(result)
16
+ end
17
+
18
+ def test_simple
19
+ result = @ce.process(pdf_doc('simple'))
20
+ check_content(result)
21
+ end
22
+
23
+ private
24
+ def check_content(result)
25
+ assert_not_nil result
26
+ assert_nil result[:title]
27
+ assert_nil result[:links]
28
+ assert_not_nil result[:content]
29
+ assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
30
+ end
31
+
32
+ end
33
+
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+ class WordContentExtractorTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @ce = ContentExtractors::WordContentExtractor.new
7
+ end
8
+
9
+ def test_can_do
10
+ assert !@ce.can_do('application/pdf')
11
+ assert @ce.can_do('application/msword')
12
+ end
13
+ def test_simple_with_ctype
14
+ result = ContentExtractors.process(word_doc('simple'), 'application/msword')
15
+ check_content(result)
16
+ end
17
+
18
+ def test_simple
19
+ result = @ce.process(word_doc('simple'))
20
+ check_content(result)
21
+ end
22
+
23
+ private
24
+ def check_content(result)
25
+ assert_not_nil result
26
+ assert_equal [], result[:links]
27
+ assert_not_nil result[:title]
28
+ assert_equal 'Untitled', result[:title]
29
+ assert_not_nil result[:content]
30
+ assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
31
+ end
32
+
33
+ end
34
+
metadata CHANGED
@@ -1,102 +1,107 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.10
2
+ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2006-03-25
6
+ version: 0.2.0
7
+ date: 2006-04-19 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: jk@jkraemer.net
12
12
  homepage: http://rdig.rubyforge.org/
13
13
  rubyforge_project: rdig
14
- description: "RDig provides an HTTP crawler and content extraction utilities to help building
15
- a site search for web sites or intranets. Internally, Ferret is used for the
16
- full text indexing. After creating a config file for your site, the index can
17
- be built with a single call to rdig."
14
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
18
15
  autorequire:
19
16
  default_executable: rdig
20
17
  bindir: bin
21
18
  has_rdoc: true
22
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
23
20
  requirements:
24
- -
25
- - ">"
26
- - !ruby/object:Gem::Version
27
- version: 0.0.0
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
28
24
  version:
29
25
  platform: ruby
26
+ signing_key:
27
+ cert_chain:
30
28
  authors:
31
- - Jens Kraemer
29
+ - Jens Kraemer
32
30
  files:
33
- - bin/rdig
34
- - lib/rdig
35
- - lib/htmlentities
36
- - lib/rdig.rb
37
- - lib/rdig/http_client.rb
38
- - lib/rdig/crawler.rb
39
- - lib/rdig/search.rb
40
- - lib/rdig/highlight.rb
41
- - lib/rdig/index.rb
42
- - lib/rdig/url_filters.rb
43
- - lib/rdig/content_extractors.rb
44
- - lib/htmlentities/CHANGES
45
- - lib/htmlentities/COPYING
46
- - lib/htmlentities/README
47
- - lib/htmlentities/htmlentities.rb
48
- - test/unit
49
- - test/fixtures
50
- - test/test_helper.rb
51
- - test/unit/etag_filter_test.rb
52
- - test/unit/url_filters_test.rb
53
- - test/unit/html_content_extractor_test.rb
54
- - test/fixtures/html
55
- - test/fixtures/html/entities.html
56
- - test/fixtures/html/simple.html
57
- - test/fixtures/html/custom_tag_selectors.html
58
- - doc/examples
59
- - doc/examples/config.rb
60
- - LICENSE
61
- - TODO
62
- - CHANGES
63
- - README
64
- - install.rb
65
- - rakefile
31
+ - bin/rdig
32
+ - lib/rdig
33
+ - lib/htmlentities
34
+ - lib/rdig.rb
35
+ - lib/rdig/http_client.rb
36
+ - lib/rdig/crawler.rb
37
+ - lib/rdig/search.rb
38
+ - lib/rdig/highlight.rb
39
+ - lib/rdig/index.rb
40
+ - lib/rdig/url_filters.rb
41
+ - lib/rdig/content_extractors.rb
42
+ - lib/htmlentities/CHANGES
43
+ - lib/htmlentities/COPYING
44
+ - lib/htmlentities/README
45
+ - lib/htmlentities/htmlentities.rb
46
+ - test/unit
47
+ - test/fixtures
48
+ - test/test_helper.rb
49
+ - test/unit/etag_filter_test.rb
50
+ - test/unit/url_filters_test.rb
51
+ - test/unit/html_content_extractor_test.rb
52
+ - test/unit/pdf_content_extractor_test.rb
53
+ - test/unit/word_content_extractor_test.rb
54
+ - test/fixtures/html
55
+ - test/fixtures/pdf
56
+ - test/fixtures/word
57
+ - test/fixtures/html/entities.html
58
+ - test/fixtures/html/simple.html
59
+ - test/fixtures/html/custom_tag_selectors.html
60
+ - test/fixtures/pdf/simple.pdf
61
+ - test/fixtures/word/simple.doc
62
+ - doc/examples
63
+ - doc/examples/config.rb
64
+ - LICENSE
65
+ - TODO
66
+ - CHANGES
67
+ - README
68
+ - install.rb
69
+ - rakefile
66
70
  test_files: []
71
+
67
72
  rdoc_options:
68
- - "--title"
69
- - "Rake -- Ruby Make"
70
- - "--main"
71
- - README
72
- - "--line-numbers"
73
+ - --title
74
+ - Rake -- Ruby Make
75
+ - --main
76
+ - README
77
+ - --line-numbers
73
78
  extra_rdoc_files:
74
- - README
75
- - CHANGES
76
- - LICENSE
77
- - TODO
79
+ - README
80
+ - CHANGES
81
+ - LICENSE
82
+ - TODO
78
83
  executables:
79
- - rdig
84
+ - rdig
80
85
  extensions: []
86
+
81
87
  requirements: []
88
+
82
89
  dependencies:
83
- - !ruby/object:Gem::Dependency
84
- name: ferret
85
- version_requirement:
86
- version_requirements: !ruby/object:Gem::Version::Requirement
87
- requirements:
88
- -
89
- - ">="
90
- - !ruby/object:Gem::Version
91
- version: 0.3.2
92
- version:
93
- - !ruby/object:Gem::Dependency
94
- name: rubyful_soup
95
- version_requirement:
96
- version_requirements: !ruby/object:Gem::Version::Requirement
97
- requirements:
98
- -
99
- - ">="
100
- - !ruby/object:Gem::Version
101
- version: 1.0.4
102
- version:
90
+ - !ruby/object:Gem::Dependency
91
+ name: ferret
92
+ version_requirement:
93
+ version_requirements: !ruby/object:Gem::Version::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 0.3.2
98
+ version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: rubyful_soup
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.0.4
107
+ version: