rdig 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -0
- data/bin/rdig +0 -17
- data/lib/rdig.rb +5 -1
- data/lib/rdig/content_extractors.rb +133 -18
- data/rakefile +6 -1
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/fixtures/word/simple.doc +0 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/html_content_extractor_test.rb +11 -2
- data/test/unit/pdf_content_extractor_test.rb +33 -0
- data/test/unit/word_content_extractor_test.rb +34 -0
- metadata +81 -76
data/CHANGES
CHANGED
data/bin/rdig
CHANGED
@@ -13,20 +13,3 @@ end
|
|
13
13
|
RDig.application.run
|
14
14
|
|
15
15
|
|
16
|
-
#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
|
17
|
-
#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
18
|
-
#require 'init'
|
19
|
-
|
20
|
-
#if ARGV[0]
|
21
|
-
# require ARGV[0]
|
22
|
-
#else
|
23
|
-
# require 'config'
|
24
|
-
#end
|
25
|
-
|
26
|
-
#include SiteSearch
|
27
|
-
|
28
|
-
|
29
|
-
#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
|
30
|
-
|
31
|
-
#crawler = Crawler.new
|
32
|
-
#crawler.run
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.1.0'
|
27
|
+
RDIGVERSION = '0.2.0'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -37,6 +37,10 @@ require 'cgi'
|
|
37
37
|
require 'set'
|
38
38
|
require 'net/http'
|
39
39
|
require 'getoptlong'
|
40
|
+
require 'tempfile'
|
41
|
+
# mkmf gives us the handy find_executable method used to check for helper
|
42
|
+
# programs:
|
43
|
+
require 'mkmf'
|
40
44
|
|
41
45
|
begin
|
42
46
|
require 'rubyful_soup'
|
data/lib/rdig/content_extractors.rb
CHANGED
@@ -23,31 +23,146 @@ end
|
|
23
23
|
|
24
24
|
module RDig
|
25
25
|
|
26
|
-
# Contains
|
26
|
+
# Contains classes which are used for extracting content and meta data from
|
27
27
|
# various content types.
|
28
|
-
#
|
29
|
-
# TODO: support at least pdf, too.
|
30
28
|
module ContentExtractors
|
31
29
|
|
32
30
|
# process the given +content+ depending on its +content_type+.
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
def self.process(content, content_type)
|
32
|
+
ContentExtractor.process(content, content_type)
|
33
|
+
# case content_type
|
34
|
+
#when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
|
35
|
+
# return HtmlContentExtractor.process(content)
|
36
|
+
#when /^application\/.+pdf/
|
37
|
+
# return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
|
38
|
+
#else
|
39
|
+
# puts "unable to handle content type #{content_type}"
|
40
|
+
#end
|
41
|
+
#return nil
|
42
|
+
end
|
43
|
+
|
44
|
+
# Base class for Content Extractors.
|
45
|
+
# Extractors inheriting from this class will be auto-discovered and used
|
46
|
+
# when can_do returns true
|
47
|
+
class ContentExtractor
|
48
|
+
|
49
|
+
def self.inherited(extractor)
|
50
|
+
super(extractor)
|
51
|
+
puts("discovered content extractor class: #{extractor}")
|
52
|
+
self.extractors << extractor
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.extractors; @@extractors ||= [] end
|
56
|
+
def self.extractor_instances
|
57
|
+
@@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
|
58
|
+
end
|
59
|
+
|
60
|
+
def self.process(content, content_type)
|
61
|
+
self.extractor_instances.each { |extractor|
|
62
|
+
return extractor.process(content) if extractor.can_do(content_type)
|
63
|
+
}
|
38
64
|
puts "unable to handle content type #{content_type}"
|
65
|
+
nil
|
66
|
+
end
|
67
|
+
|
68
|
+
def can_do(content_type)
|
69
|
+
content_type =~ @pattern
|
39
70
|
end
|
40
|
-
|
71
|
+
end
|
72
|
+
|
73
|
+
|
74
|
+
# to be used by concrete implementations having a get_content instance method
|
75
|
+
# that takes a path to a file and returns the textual content extracted from
|
76
|
+
# that file.
|
77
|
+
module ExternalAppHelper
|
78
|
+
def process(content)
|
79
|
+
result = {}
|
80
|
+
as_file(content) do |file|
|
81
|
+
result[:content] = get_content(file.path).strip
|
82
|
+
end
|
83
|
+
result
|
84
|
+
end
|
85
|
+
|
86
|
+
def as_file(content)
|
87
|
+
file = Tempfile.new('rdig')
|
88
|
+
file << content
|
89
|
+
file.close
|
90
|
+
yield file
|
91
|
+
file.delete
|
92
|
+
end
|
93
|
+
|
94
|
+
def available
|
95
|
+
if @available.nil?
|
96
|
+
@available = !find_executable(@executable).nil?
|
97
|
+
end
|
98
|
+
@available
|
99
|
+
end
|
100
|
+
|
101
|
+
def can_do(content_type)
|
102
|
+
available and super(content_type)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# Extract text from pdf content.
|
107
|
+
#
|
108
|
+
# Requires the pdftotext utility from the xpdf-utils package
|
109
|
+
# (on debian and friends do 'apt-get install xpdf-utils')
|
110
|
+
#
|
111
|
+
# TODO: use pdfinfo to get title from document
|
112
|
+
class PdfContentExtractor < ContentExtractor
|
113
|
+
include ExternalAppHelper
|
114
|
+
|
115
|
+
def initialize
|
116
|
+
@executable = 'pdftotext'
|
117
|
+
@pattern = /^application\/pdf/
|
118
|
+
end
|
119
|
+
|
120
|
+
def get_content(path_to_tempfile)
|
121
|
+
%x{#{@executable} '#{path_to_tempfile}' -}
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Extract text from word documents
|
126
|
+
#
|
127
|
+
# Requires the wvHtml utility from the wv package
|
128
|
+
# (on debian and friends do 'apt-get install wv')
|
129
|
+
class WordContentExtractor < ContentExtractor
|
130
|
+
include ExternalAppHelper
|
131
|
+
|
132
|
+
def initialize
|
133
|
+
@executable = 'wvHtml'
|
134
|
+
@pattern = /^application\/msword/
|
135
|
+
@html_extractor = HtmlContentExtractor.new
|
136
|
+
end
|
137
|
+
|
138
|
+
def process(content)
|
139
|
+
result = {}
|
140
|
+
as_file(content) do |infile|
|
141
|
+
outfile = Tempfile.new('rdig')
|
142
|
+
outfile.close
|
143
|
+
%x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
|
144
|
+
File.open(outfile.path) do |html|
|
145
|
+
result = @html_extractor.process(html.read)
|
146
|
+
end
|
147
|
+
outfile.delete
|
148
|
+
end
|
149
|
+
return result || {}
|
150
|
+
end
|
151
|
+
|
41
152
|
end
|
42
153
|
|
43
154
|
# extracts title, content and links from html documents
|
44
|
-
class HtmlContentExtractor
|
155
|
+
class HtmlContentExtractor < ContentExtractor
|
156
|
+
|
157
|
+
def initialize
|
158
|
+
@pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
|
159
|
+
end
|
45
160
|
|
46
161
|
# returns:
|
47
162
|
# { :content => 'extracted clear text',
|
48
163
|
# :meta => { :title => 'Title' },
|
49
164
|
# :links => [array of urls] }
|
50
|
-
def
|
165
|
+
def process(content)
|
51
166
|
result = { }
|
52
167
|
tag_soup = BeautifulSoup.new(content)
|
53
168
|
result[:title] = extract_title(tag_soup)
|
@@ -64,7 +179,7 @@ module RDig
|
|
64
179
|
# - Then, this element is processed by +extract_text+, which will give
|
65
180
|
# all textual content contained in the root element and all its
|
66
181
|
# children.
|
67
|
-
def
|
182
|
+
def extract_content(tag_soup)
|
68
183
|
content = ''
|
69
184
|
content_element(tag_soup).children { |child|
|
70
185
|
extract_text(child, content)
|
@@ -74,14 +189,14 @@ module RDig
|
|
74
189
|
|
75
190
|
# extracts the href attributes of all a tags, except
|
76
191
|
# internal links like <a href="#top">
|
77
|
-
def
|
192
|
+
def extract_links(tagsoup)
|
78
193
|
tagsoup.find_all('a').map { |link|
|
79
194
|
CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
|
80
195
|
}.compact
|
81
196
|
end
|
82
197
|
|
83
198
|
# Extracts the title from the given html tree
|
84
|
-
def
|
199
|
+
def extract_title(tagsoup)
|
85
200
|
title = ''
|
86
201
|
the_title_tag = title_tag(tagsoup)
|
87
202
|
if the_title_tag.is_a? String
|
@@ -93,7 +208,7 @@ module RDig
|
|
93
208
|
|
94
209
|
# Recursively extracts all text contained in the given element,
|
95
210
|
# and appends it to content.
|
96
|
-
def
|
211
|
+
def extract_text(element, content='')
|
97
212
|
if element.is_a? NavigableString
|
98
213
|
value = strip_comments(element)
|
99
214
|
value.strip!
|
@@ -118,7 +233,7 @@ module RDig
|
|
118
233
|
#
|
119
234
|
# This may return a string, e.g. an attribute value selected from a meta
|
120
235
|
# tag, too.
|
121
|
-
def
|
236
|
+
def title_tag(tagsoup)
|
122
237
|
if RDig.config.content_extraction.html.title_tag_selector
|
123
238
|
RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
|
124
239
|
else
|
@@ -127,7 +242,7 @@ module RDig
|
|
127
242
|
end
|
128
243
|
|
129
244
|
# Retrieve the root element to extract document content from
|
130
|
-
def
|
245
|
+
def content_element(tagsoup)
|
131
246
|
if RDig.config.content_extraction.html.content_tag_selector
|
132
247
|
RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
|
133
248
|
else
|
@@ -136,7 +251,7 @@ module RDig
|
|
136
251
|
end
|
137
252
|
|
138
253
|
# Return the given string minus all html comments
|
139
|
-
def
|
254
|
+
def strip_comments(string)
|
140
255
|
string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
|
141
256
|
end
|
142
257
|
end
|
data/rakefile
CHANGED
@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
|
|
94
94
|
# Generate the RDoc documentation ----------------------------------------
|
95
95
|
|
96
96
|
rd = Rake::RDocTask.new { |rdoc|
|
97
|
-
rdoc.rdoc_dir = '
|
97
|
+
rdoc.rdoc_dir = 'html'
|
98
98
|
rdoc.title = "RDig - Ferret based full text search for web sites"
|
99
99
|
rdoc.options << '--line-numbers' << '--inline-source'
|
100
100
|
rdoc.options << '--main' << 'README'
|
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
|
|
323
323
|
end
|
324
324
|
end
|
325
325
|
|
326
|
+
# Publish RDocs ------------------------------------------------------
|
327
|
+
desc "Publish the API documentation"
|
328
|
+
task :pdoc => [:rdoc] do
|
329
|
+
Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
|
330
|
+
end
|
Binary file
|
Binary file
|
data/test/test_helper.rb
CHANGED
data/test/unit/html_content_extractor_test.rb
CHANGED
@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
3
3
|
include TestHelper
|
4
4
|
|
5
5
|
def setup
|
6
|
-
@extractor = ContentExtractors::HtmlContentExtractor
|
6
|
+
@extractor = ContentExtractors::HtmlContentExtractor.new
|
7
7
|
@nbsp = [160].pack('U') # non breaking space
|
8
8
|
@config_backup = RDig.config.content_extraction.html.clone
|
9
9
|
end
|
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
12
12
|
RDig.config.content_extraction.html = @config_backup
|
13
13
|
end
|
14
14
|
|
15
|
+
def test_can_do
|
16
|
+
assert !@extractor.can_do('application/pdf')
|
17
|
+
assert !@extractor.can_do('application/msword')
|
18
|
+
assert @extractor.can_do('text/html')
|
19
|
+
assert @extractor.can_do('text/xml')
|
20
|
+
assert @extractor.can_do('application/xml')
|
21
|
+
assert @extractor.can_do('application/xhtml+xml')
|
22
|
+
end
|
23
|
+
|
15
24
|
def test_simple
|
16
|
-
result =
|
25
|
+
result = ContentExtractors.process(html_doc('simple'), 'text/html')
|
17
26
|
assert_not_nil result
|
18
27
|
assert_equal 'Sample Title', result[:title]
|
19
28
|
assert_not_nil result[:content]
|
data/test/unit/pdf_content_extractor_test.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
class PdfContentExtractorTest < Test::Unit::TestCase
|
3
|
+
include TestHelper
|
4
|
+
|
5
|
+
def setup
|
6
|
+
@ce = ContentExtractors::PdfContentExtractor.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_can_do
|
10
|
+
assert @ce.can_do('application/pdf')
|
11
|
+
assert !@ce.can_do('application/msword')
|
12
|
+
end
|
13
|
+
def test_simple_with_ctype
|
14
|
+
result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
|
15
|
+
check_content(result)
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_simple
|
19
|
+
result = @ce.process(pdf_doc('simple'))
|
20
|
+
check_content(result)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
def check_content(result)
|
25
|
+
assert_not_nil result
|
26
|
+
assert_nil result[:title]
|
27
|
+
assert_nil result[:links]
|
28
|
+
assert_not_nil result[:content]
|
29
|
+
assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
data/test/unit/word_content_extractor_test.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
class WordContentExtractorTest < Test::Unit::TestCase
|
3
|
+
include TestHelper
|
4
|
+
|
5
|
+
def setup
|
6
|
+
@ce = ContentExtractors::WordContentExtractor.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_can_do
|
10
|
+
assert !@ce.can_do('application/pdf')
|
11
|
+
assert @ce.can_do('application/msword')
|
12
|
+
end
|
13
|
+
def test_simple_with_ctype
|
14
|
+
result = ContentExtractors.process(word_doc('simple'), 'application/msword')
|
15
|
+
check_content(result)
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_simple
|
19
|
+
result = @ce.process(word_doc('simple'))
|
20
|
+
check_content(result)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
def check_content(result)
|
25
|
+
assert_not_nil result
|
26
|
+
assert_equal [], result[:links]
|
27
|
+
assert_not_nil result[:title]
|
28
|
+
assert_equal 'Untitled', result[:title]
|
29
|
+
assert_not_nil result[:content]
|
30
|
+
assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
metadata
CHANGED
@@ -1,102 +1,107 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.8.
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.1.0
|
7
|
-
date: 2006-
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2006-04-19 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
|
-
|
10
|
+
- lib
|
11
11
|
email: jk@jkraemer.net
|
12
12
|
homepage: http://rdig.rubyforge.org/
|
13
13
|
rubyforge_project: rdig
|
14
|
-
description:
|
15
|
-
a site search for web sites or intranets. Internally, Ferret is used for the
|
16
|
-
full text indexing. After creating a config file for your site, the index can
|
17
|
-
be built with a single call to rdig."
|
14
|
+
description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
|
18
15
|
autorequire:
|
19
16
|
default_executable: rdig
|
20
17
|
bindir: bin
|
21
18
|
has_rdoc: true
|
22
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
20
|
requirements:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
version: 0.0.0
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
28
24
|
version:
|
29
25
|
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
30
28
|
authors:
|
31
|
-
|
29
|
+
- Jens Kraemer
|
32
30
|
files:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
31
|
+
- bin/rdig
|
32
|
+
- lib/rdig
|
33
|
+
- lib/htmlentities
|
34
|
+
- lib/rdig.rb
|
35
|
+
- lib/rdig/http_client.rb
|
36
|
+
- lib/rdig/crawler.rb
|
37
|
+
- lib/rdig/search.rb
|
38
|
+
- lib/rdig/highlight.rb
|
39
|
+
- lib/rdig/index.rb
|
40
|
+
- lib/rdig/url_filters.rb
|
41
|
+
- lib/rdig/content_extractors.rb
|
42
|
+
- lib/htmlentities/CHANGES
|
43
|
+
- lib/htmlentities/COPYING
|
44
|
+
- lib/htmlentities/README
|
45
|
+
- lib/htmlentities/htmlentities.rb
|
46
|
+
- test/unit
|
47
|
+
- test/fixtures
|
48
|
+
- test/test_helper.rb
|
49
|
+
- test/unit/etag_filter_test.rb
|
50
|
+
- test/unit/url_filters_test.rb
|
51
|
+
- test/unit/html_content_extractor_test.rb
|
52
|
+
- test/unit/pdf_content_extractor_test.rb
|
53
|
+
- test/unit/word_content_extractor_test.rb
|
54
|
+
- test/fixtures/html
|
55
|
+
- test/fixtures/pdf
|
56
|
+
- test/fixtures/word
|
57
|
+
- test/fixtures/html/entities.html
|
58
|
+
- test/fixtures/html/simple.html
|
59
|
+
- test/fixtures/html/custom_tag_selectors.html
|
60
|
+
- test/fixtures/pdf/simple.pdf
|
61
|
+
- test/fixtures/word/simple.doc
|
62
|
+
- doc/examples
|
63
|
+
- doc/examples/config.rb
|
64
|
+
- LICENSE
|
65
|
+
- TODO
|
66
|
+
- CHANGES
|
67
|
+
- README
|
68
|
+
- install.rb
|
69
|
+
- rakefile
|
66
70
|
test_files: []
|
71
|
+
|
67
72
|
rdoc_options:
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
+
- --title
|
74
|
+
- Rake -- Ruby Make
|
75
|
+
- --main
|
76
|
+
- README
|
77
|
+
- --line-numbers
|
73
78
|
extra_rdoc_files:
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
79
|
+
- README
|
80
|
+
- CHANGES
|
81
|
+
- LICENSE
|
82
|
+
- TODO
|
78
83
|
executables:
|
79
|
-
|
84
|
+
- rdig
|
80
85
|
extensions: []
|
86
|
+
|
81
87
|
requirements: []
|
88
|
+
|
82
89
|
dependencies:
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
version: 1.0.4
|
102
|
-
version:
|
90
|
+
- !ruby/object:Gem::Dependency
|
91
|
+
name: ferret
|
92
|
+
version_requirement:
|
93
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: 0.3.2
|
98
|
+
version:
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: rubyful_soup
|
101
|
+
version_requirement:
|
102
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 1.0.4
|
107
|
+
version:
|