rdig 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,2 +1,8 @@
1
+ 0.2.0
2
+ - add pdf and Word content extraction capabilities using the tools
3
+ from the xpdf-utils and wv packages
4
+ - additional content extractors may be plugged in by extending
5
+ the ContentExtractor class
6
+
1
7
  0.1.0
2
8
  initial release
data/bin/rdig CHANGED
@@ -13,20 +13,3 @@ end
13
13
  RDig.application.run
14
14
 
15
15
 
16
- #$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
17
- #$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
18
- #require 'init'
19
-
20
- #if ARGV[0]
21
- # require ARGV[0]
22
- #else
23
- # require 'config'
24
- #end
25
-
26
- #include SiteSearch
27
-
28
-
29
- #puts "creating new index in #{SiteSearch.settings[:index_dir]}"
30
-
31
- #crawler = Crawler.new
32
- #crawler.run
data/lib/rdig.rb CHANGED
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.1.0'
27
+ RDIGVERSION = '0.2.0'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -37,6 +37,10 @@ require 'cgi'
37
37
  require 'set'
38
38
  require 'net/http'
39
39
  require 'getoptlong'
40
+ require 'tempfile'
41
+ # mkmf gives us the handy find_executable method used to check for helper
42
+ # programs:
43
+ require 'mkmf'
40
44
 
41
45
  begin
42
46
  require 'rubyful_soup'
@@ -23,31 +23,146 @@ end
23
23
 
24
24
  module RDig
25
25
 
26
- # Contains Classes which are used for extracting content and meta data from
26
+ # Contains classes which are used for extracting content and meta data from
27
27
  # various content types.
28
- #
29
- # TODO: support at least pdf, too.
30
28
  module ContentExtractors
31
29
 
32
30
  # process the given +content+ depending on its +content_type+.
33
- def ContentExtractors.process(content, content_type)
34
- case content_type
35
- when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
36
- return HtmlContentExtractor.process(content)
37
- else
31
+ def self.process(content, content_type)
32
+ ContentExtractor.process(content, content_type)
33
+ # case content_type
34
+ #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
35
+ # return HtmlContentExtractor.process(content)
36
+ #when /^application\/.+pdf/
37
+ # return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
38
+ #else
39
+ # puts "unable to handle content type #{content_type}"
40
+ #end
41
+ #return nil
42
+ end
43
+
44
+ # Base class for Content Extractors.
45
+ # Extractors inheriting from this class will be auto-discovered and used
46
+ # when can_do returns true
47
+ class ContentExtractor
48
+
49
+ def self.inherited(extractor)
50
+ super(extractor)
51
+ puts("discovered content extractor class: #{extractor}")
52
+ self.extractors << extractor
53
+ end
54
+
55
+ def self.extractors; @@extractors ||= [] end
56
+ def self.extractor_instances
57
+ @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
58
+ end
59
+
60
+ def self.process(content, content_type)
61
+ self.extractor_instances.each { |extractor|
62
+ return extractor.process(content) if extractor.can_do(content_type)
63
+ }
38
64
  puts "unable to handle content type #{content_type}"
65
+ nil
66
+ end
67
+
68
+ def can_do(content_type)
69
+ content_type =~ @pattern
39
70
  end
40
- return nil
71
+ end
72
+
73
+
74
+ # to be used by concrete implementations having a get_content instance method
75
+ # that takes a path to a file and returns the textual content extracted from
76
+ # that file.
77
+ module ExternalAppHelper
78
+ def process(content)
79
+ result = {}
80
+ as_file(content) do |file|
81
+ result[:content] = get_content(file.path).strip
82
+ end
83
+ result
84
+ end
85
+
86
+ def as_file(content)
87
+ file = Tempfile.new('rdig')
88
+ file << content
89
+ file.close
90
+ yield file
91
+ file.delete
92
+ end
93
+
94
+ def available
95
+ if @available.nil?
96
+ @available = !find_executable(@executable).nil?
97
+ end
98
+ @available
99
+ end
100
+
101
+ def can_do(content_type)
102
+ available and super(content_type)
103
+ end
104
+ end
105
+
106
+ # Extract text from pdf content.
107
+ #
108
+ # Requires the pdftotext utility from the xpdf-utils package
109
+ # (on debian and friends do 'apt-get install xpdf-utils')
110
+ #
111
+ # TODO: use pdfinfo to get title from document
112
+ class PdfContentExtractor < ContentExtractor
113
+ include ExternalAppHelper
114
+
115
+ def initialize
116
+ @executable = 'pdftotext'
117
+ @pattern = /^application\/pdf/
118
+ end
119
+
120
+ def get_content(path_to_tempfile)
121
+ %x{#{@executable} '#{path_to_tempfile}' -}
122
+ end
123
+ end
124
+
125
+ # Extract text from word documents
126
+ #
127
+ # Requires the wvHtml utility from the wv package
128
+ # (on debian and friends do 'apt-get install wv')
129
+ class WordContentExtractor < ContentExtractor
130
+ include ExternalAppHelper
131
+
132
+ def initialize
133
+ @executable = 'wvHtml'
134
+ @pattern = /^application\/msword/
135
+ @html_extractor = HtmlContentExtractor.new
136
+ end
137
+
138
+ def process(content)
139
+ result = {}
140
+ as_file(content) do |infile|
141
+ outfile = Tempfile.new('rdig')
142
+ outfile.close
143
+ %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
144
+ File.open(outfile.path) do |html|
145
+ result = @html_extractor.process(html.read)
146
+ end
147
+ outfile.delete
148
+ end
149
+ return result || {}
150
+ end
151
+
41
152
  end
42
153
 
43
154
  # extracts title, content and links from html documents
44
- class HtmlContentExtractor
155
+ class HtmlContentExtractor < ContentExtractor
156
+
157
+ def initialize
158
+ @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
159
+ end
45
160
 
46
161
  # returns:
47
162
  # { :content => 'extracted clear text',
48
163
  # :meta => { :title => 'Title' },
49
164
  # :links => [array of urls] }
50
- def self.process(content)
165
+ def process(content)
51
166
  result = { }
52
167
  tag_soup = BeautifulSoup.new(content)
53
168
  result[:title] = extract_title(tag_soup)
@@ -64,7 +179,7 @@ module RDig
64
179
  # - Then, this element is processed by +extract_text+, which will give
65
180
  # all textual content contained in the root element and all its
66
181
  # children.
67
- def self.extract_content(tag_soup)
182
+ def extract_content(tag_soup)
68
183
  content = ''
69
184
  content_element(tag_soup).children { |child|
70
185
  extract_text(child, content)
@@ -74,14 +189,14 @@ module RDig
74
189
 
75
190
  # extracts the href attributes of all a tags, except
76
191
  # internal links like <a href="#top">
77
- def self.extract_links(tagsoup)
192
+ def extract_links(tagsoup)
78
193
  tagsoup.find_all('a').map { |link|
79
194
  CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
80
195
  }.compact
81
196
  end
82
197
 
83
198
  # Extracts the title from the given html tree
84
- def self.extract_title(tagsoup)
199
+ def extract_title(tagsoup)
85
200
  title = ''
86
201
  the_title_tag = title_tag(tagsoup)
87
202
  if the_title_tag.is_a? String
@@ -93,7 +208,7 @@ module RDig
93
208
 
94
209
  # Recursively extracts all text contained in the given element,
95
210
  # and appends it to content.
96
- def self.extract_text(element, content='')
211
+ def extract_text(element, content='')
97
212
  if element.is_a? NavigableString
98
213
  value = strip_comments(element)
99
214
  value.strip!
@@ -118,7 +233,7 @@ module RDig
118
233
  #
119
234
  # This may return a string, e.g. an attribute value selected from a meta
120
235
  # tag, too.
121
- def self.title_tag(tagsoup)
236
+ def title_tag(tagsoup)
122
237
  if RDig.config.content_extraction.html.title_tag_selector
123
238
  RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
124
239
  else
@@ -127,7 +242,7 @@ module RDig
127
242
  end
128
243
 
129
244
  # Retrieve the root element to extract document content from
130
- def self.content_element(tagsoup)
245
+ def content_element(tagsoup)
131
246
  if RDig.config.content_extraction.html.content_tag_selector
132
247
  RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
133
248
  else
@@ -136,7 +251,7 @@ module RDig
136
251
  end
137
252
 
138
253
  # Return the given string minus all html comments
139
- def self.strip_comments(string)
254
+ def strip_comments(string)
140
255
  string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
141
256
  end
142
257
  end
data/rakefile CHANGED
@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
94
94
  # Generate the RDoc documentation ----------------------------------------
95
95
 
96
96
  rd = Rake::RDocTask.new { |rdoc|
97
- rdoc.rdoc_dir = 'doc/html'
97
+ rdoc.rdoc_dir = 'html'
98
98
  rdoc.title = "RDig - Ferret based full text search for web sites"
99
99
  rdoc.options << '--line-numbers' << '--inline-source'
100
100
  rdoc.options << '--main' << 'README'
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
323
323
  end
324
324
  end
325
325
 
326
+ # Publish RDocs ------------------------------------------------------
327
+ desc "Publish the API documentation"
328
+ task :pdoc => [:rdoc] do
329
+ Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
330
+ end
Binary file
Binary file
data/test/test_helper.rb CHANGED
@@ -12,6 +12,12 @@ module TestHelper
12
12
  }
13
13
  end
14
14
 
15
+ def word_doc(name)
16
+ read_fixture("word/#{name}.doc")
17
+ end
18
+ def pdf_doc(name)
19
+ read_fixture("pdf/#{name}.pdf")
20
+ end
15
21
  def html_doc(name)
16
22
  read_fixture("html/#{name}.html")
17
23
  end
@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
3
3
  include TestHelper
4
4
 
5
5
  def setup
6
- @extractor = ContentExtractors::HtmlContentExtractor
6
+ @extractor = ContentExtractors::HtmlContentExtractor.new
7
7
  @nbsp = [160].pack('U') # non breaking space
8
8
  @config_backup = RDig.config.content_extraction.html.clone
9
9
  end
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
12
12
  RDig.config.content_extraction.html = @config_backup
13
13
  end
14
14
 
15
+ def test_can_do
16
+ assert !@extractor.can_do('application/pdf')
17
+ assert !@extractor.can_do('application/msword')
18
+ assert @extractor.can_do('text/html')
19
+ assert @extractor.can_do('text/xml')
20
+ assert @extractor.can_do('application/xml')
21
+ assert @extractor.can_do('application/xhtml+xml')
22
+ end
23
+
15
24
  def test_simple
16
- result = @extractor.process(html_doc('simple'))
25
+ result = ContentExtractors.process(html_doc('simple'), 'text/html')
17
26
  assert_not_nil result
18
27
  assert_equal 'Sample Title', result[:title]
19
28
  assert_not_nil result[:content]
@@ -0,0 +1,33 @@
1
+ require 'test_helper'
2
+ class PdfContentExtractorTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @ce = ContentExtractors::PdfContentExtractor.new
7
+ end
8
+
9
+ def test_can_do
10
+ assert @ce.can_do('application/pdf')
11
+ assert !@ce.can_do('application/msword')
12
+ end
13
+ def test_simple_with_ctype
14
+ result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
15
+ check_content(result)
16
+ end
17
+
18
+ def test_simple
19
+ result = @ce.process(pdf_doc('simple'))
20
+ check_content(result)
21
+ end
22
+
23
+ private
24
+ def check_content(result)
25
+ assert_not_nil result
26
+ assert_nil result[:title]
27
+ assert_nil result[:links]
28
+ assert_not_nil result[:content]
29
+ assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
30
+ end
31
+
32
+ end
33
+
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+ class WordContentExtractorTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @ce = ContentExtractors::WordContentExtractor.new
7
+ end
8
+
9
+ def test_can_do
10
+ assert !@ce.can_do('application/pdf')
11
+ assert @ce.can_do('application/msword')
12
+ end
13
+ def test_simple_with_ctype
14
+ result = ContentExtractors.process(word_doc('simple'), 'application/msword')
15
+ check_content(result)
16
+ end
17
+
18
+ def test_simple
19
+ result = @ce.process(word_doc('simple'))
20
+ check_content(result)
21
+ end
22
+
23
+ private
24
+ def check_content(result)
25
+ assert_not_nil result
26
+ assert_equal [], result[:links]
27
+ assert_not_nil result[:title]
28
+ assert_equal 'Untitled', result[:title]
29
+ assert_not_nil result[:content]
30
+ assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
31
+ end
32
+
33
+ end
34
+
metadata CHANGED
@@ -1,102 +1,107 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.10
2
+ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: rdig
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.0
7
- date: 2006-03-25
6
+ version: 0.2.0
7
+ date: 2006-04-19 00:00:00 +02:00
8
8
  summary: Ruby based web site indexing and searching library.
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: jk@jkraemer.net
12
12
  homepage: http://rdig.rubyforge.org/
13
13
  rubyforge_project: rdig
14
- description: "RDig provides an HTTP crawler and content extraction utilities to help building
15
- a site search for web sites or intranets. Internally, Ferret is used for the
16
- full text indexing. After creating a config file for your site, the index can
17
- be built with a single call to rdig."
14
+ description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
18
15
  autorequire:
19
16
  default_executable: rdig
20
17
  bindir: bin
21
18
  has_rdoc: true
22
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
23
20
  requirements:
24
- -
25
- - ">"
26
- - !ruby/object:Gem::Version
27
- version: 0.0.0
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
28
24
  version:
29
25
  platform: ruby
26
+ signing_key:
27
+ cert_chain:
30
28
  authors:
31
- - Jens Kraemer
29
+ - Jens Kraemer
32
30
  files:
33
- - bin/rdig
34
- - lib/rdig
35
- - lib/htmlentities
36
- - lib/rdig.rb
37
- - lib/rdig/http_client.rb
38
- - lib/rdig/crawler.rb
39
- - lib/rdig/search.rb
40
- - lib/rdig/highlight.rb
41
- - lib/rdig/index.rb
42
- - lib/rdig/url_filters.rb
43
- - lib/rdig/content_extractors.rb
44
- - lib/htmlentities/CHANGES
45
- - lib/htmlentities/COPYING
46
- - lib/htmlentities/README
47
- - lib/htmlentities/htmlentities.rb
48
- - test/unit
49
- - test/fixtures
50
- - test/test_helper.rb
51
- - test/unit/etag_filter_test.rb
52
- - test/unit/url_filters_test.rb
53
- - test/unit/html_content_extractor_test.rb
54
- - test/fixtures/html
55
- - test/fixtures/html/entities.html
56
- - test/fixtures/html/simple.html
57
- - test/fixtures/html/custom_tag_selectors.html
58
- - doc/examples
59
- - doc/examples/config.rb
60
- - LICENSE
61
- - TODO
62
- - CHANGES
63
- - README
64
- - install.rb
65
- - rakefile
31
+ - bin/rdig
32
+ - lib/rdig
33
+ - lib/htmlentities
34
+ - lib/rdig.rb
35
+ - lib/rdig/http_client.rb
36
+ - lib/rdig/crawler.rb
37
+ - lib/rdig/search.rb
38
+ - lib/rdig/highlight.rb
39
+ - lib/rdig/index.rb
40
+ - lib/rdig/url_filters.rb
41
+ - lib/rdig/content_extractors.rb
42
+ - lib/htmlentities/CHANGES
43
+ - lib/htmlentities/COPYING
44
+ - lib/htmlentities/README
45
+ - lib/htmlentities/htmlentities.rb
46
+ - test/unit
47
+ - test/fixtures
48
+ - test/test_helper.rb
49
+ - test/unit/etag_filter_test.rb
50
+ - test/unit/url_filters_test.rb
51
+ - test/unit/html_content_extractor_test.rb
52
+ - test/unit/pdf_content_extractor_test.rb
53
+ - test/unit/word_content_extractor_test.rb
54
+ - test/fixtures/html
55
+ - test/fixtures/pdf
56
+ - test/fixtures/word
57
+ - test/fixtures/html/entities.html
58
+ - test/fixtures/html/simple.html
59
+ - test/fixtures/html/custom_tag_selectors.html
60
+ - test/fixtures/pdf/simple.pdf
61
+ - test/fixtures/word/simple.doc
62
+ - doc/examples
63
+ - doc/examples/config.rb
64
+ - LICENSE
65
+ - TODO
66
+ - CHANGES
67
+ - README
68
+ - install.rb
69
+ - rakefile
66
70
  test_files: []
71
+
67
72
  rdoc_options:
68
- - "--title"
69
- - "Rake -- Ruby Make"
70
- - "--main"
71
- - README
72
- - "--line-numbers"
73
+ - --title
74
+ - Rake -- Ruby Make
75
+ - --main
76
+ - README
77
+ - --line-numbers
73
78
  extra_rdoc_files:
74
- - README
75
- - CHANGES
76
- - LICENSE
77
- - TODO
79
+ - README
80
+ - CHANGES
81
+ - LICENSE
82
+ - TODO
78
83
  executables:
79
- - rdig
84
+ - rdig
80
85
  extensions: []
86
+
81
87
  requirements: []
88
+
82
89
  dependencies:
83
- - !ruby/object:Gem::Dependency
84
- name: ferret
85
- version_requirement:
86
- version_requirements: !ruby/object:Gem::Version::Requirement
87
- requirements:
88
- -
89
- - ">="
90
- - !ruby/object:Gem::Version
91
- version: 0.3.2
92
- version:
93
- - !ruby/object:Gem::Dependency
94
- name: rubyful_soup
95
- version_requirement:
96
- version_requirements: !ruby/object:Gem::Version::Requirement
97
- requirements:
98
- -
99
- - ">="
100
- - !ruby/object:Gem::Version
101
- version: 1.0.4
102
- version:
90
+ - !ruby/object:Gem::Dependency
91
+ name: ferret
92
+ version_requirement:
93
+ version_requirements: !ruby/object:Gem::Version::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 0.3.2
98
+ version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: rubyful_soup
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: 1.0.4
107
+ version: