rdig 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +6 -0
- data/bin/rdig +0 -17
- data/lib/rdig.rb +5 -1
- data/lib/rdig/content_extractors.rb +133 -18
- data/rakefile +6 -1
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/fixtures/word/simple.doc +0 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/html_content_extractor_test.rb +11 -2
- data/test/unit/pdf_content_extractor_test.rb +33 -0
- data/test/unit/word_content_extractor_test.rb +34 -0
- metadata +81 -76
data/CHANGES
CHANGED
data/bin/rdig
CHANGED
@@ -13,20 +13,3 @@ end
|
|
13
13
|
RDig.application.run
|
14
14
|
|
15
15
|
|
16
|
-
#$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
|
17
|
-
#$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
18
|
-
#require 'init'
|
19
|
-
|
20
|
-
#if ARGV[0]
|
21
|
-
# require ARGV[0]
|
22
|
-
#else
|
23
|
-
# require 'config'
|
24
|
-
#end
|
25
|
-
|
26
|
-
#include SiteSearch
|
27
|
-
|
28
|
-
|
29
|
-
#puts "creating new index in #{SiteSearch.settings[:index_dir]}"
|
30
|
-
|
31
|
-
#crawler = Crawler.new
|
32
|
-
#crawler.run
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.
|
27
|
+
RDIGVERSION = '0.2.0'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -37,6 +37,10 @@ require 'cgi'
|
|
37
37
|
require 'set'
|
38
38
|
require 'net/http'
|
39
39
|
require 'getoptlong'
|
40
|
+
require 'tempfile'
|
41
|
+
# mkmf gives us the handy find_executable method used to check for helper
|
42
|
+
# programs:
|
43
|
+
require 'mkmf'
|
40
44
|
|
41
45
|
begin
|
42
46
|
require 'rubyful_soup'
|
@@ -23,31 +23,146 @@ end
|
|
23
23
|
|
24
24
|
module RDig
|
25
25
|
|
26
|
-
# Contains
|
26
|
+
# Contains classes which are used for extracting content and meta data from
|
27
27
|
# various content types.
|
28
|
-
#
|
29
|
-
# TODO: support at least pdf, too.
|
30
28
|
module ContentExtractors
|
31
29
|
|
32
30
|
# process the given +content+ depending on its +content_type+.
|
33
|
-
def
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
def self.process(content, content_type)
|
32
|
+
ContentExtractor.process(content, content_type)
|
33
|
+
# case content_type
|
34
|
+
#when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
|
35
|
+
# return HtmlContentExtractor.process(content)
|
36
|
+
#when /^application\/.+pdf/
|
37
|
+
# return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
|
38
|
+
#else
|
39
|
+
# puts "unable to handle content type #{content_type}"
|
40
|
+
#end
|
41
|
+
#return nil
|
42
|
+
end
|
43
|
+
|
44
|
+
# Base class for Content Extractors.
|
45
|
+
# Extractors inheriting from this class will be auto-discovered and used
|
46
|
+
# when can_do returns true
|
47
|
+
class ContentExtractor
|
48
|
+
|
49
|
+
def self.inherited(extractor)
|
50
|
+
super(extractor)
|
51
|
+
puts("discovered content extractor class: #{extractor}")
|
52
|
+
self.extractors << extractor
|
53
|
+
end
|
54
|
+
|
55
|
+
def self.extractors; @@extractors ||= [] end
|
56
|
+
def self.extractor_instances
|
57
|
+
@@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
|
58
|
+
end
|
59
|
+
|
60
|
+
def self.process(content, content_type)
|
61
|
+
self.extractor_instances.each { |extractor|
|
62
|
+
return extractor.process(content) if extractor.can_do(content_type)
|
63
|
+
}
|
38
64
|
puts "unable to handle content type #{content_type}"
|
65
|
+
nil
|
66
|
+
end
|
67
|
+
|
68
|
+
def can_do(content_type)
|
69
|
+
content_type =~ @pattern
|
39
70
|
end
|
40
|
-
|
71
|
+
end
|
72
|
+
|
73
|
+
|
74
|
+
# to be used by concrete implementations having a get_content method
|
75
|
+
# that takes a path to a file and returns the textual content extracted from
|
76
|
+
# that file.
|
77
|
+
module ExternalAppHelper
|
78
|
+
def process(content)
|
79
|
+
result = {}
|
80
|
+
as_file(content) do |file|
|
81
|
+
result[:content] = get_content(file.path).strip
|
82
|
+
end
|
83
|
+
result
|
84
|
+
end
|
85
|
+
|
86
|
+
def as_file(content)
|
87
|
+
file = Tempfile.new('rdig')
|
88
|
+
file << content
|
89
|
+
file.close
|
90
|
+
yield file
|
91
|
+
file.delete
|
92
|
+
end
|
93
|
+
|
94
|
+
def available
|
95
|
+
if @available.nil?
|
96
|
+
@available = !find_executable(@executable).nil?
|
97
|
+
end
|
98
|
+
@available
|
99
|
+
end
|
100
|
+
|
101
|
+
def can_do(content_type)
|
102
|
+
available and super(content_type)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
# Extract text from pdf content.
|
107
|
+
#
|
108
|
+
# Requires the pdftotext utility from the xpdf-utils package
|
109
|
+
# (on debian and friends do 'apt-get install xpdf-utils')
|
110
|
+
#
|
111
|
+
# TODO: use pdfinfo to get title from document
|
112
|
+
class PdfContentExtractor < ContentExtractor
|
113
|
+
include ExternalAppHelper
|
114
|
+
|
115
|
+
def initialize
|
116
|
+
@executable = 'pdftotext'
|
117
|
+
@pattern = /^application\/pdf/
|
118
|
+
end
|
119
|
+
|
120
|
+
def get_content(path_to_tempfile)
|
121
|
+
%x{#{@executable} '#{path_to_tempfile}' -}
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Extract text from word documents
|
126
|
+
#
|
127
|
+
# Requires the wvHtml utility from the wv package
|
128
|
+
# (on debian and friends do 'apt-get install wv')
|
129
|
+
class WordContentExtractor < ContentExtractor
|
130
|
+
include ExternalAppHelper
|
131
|
+
|
132
|
+
def initialize
|
133
|
+
@executable = 'wvHtml'
|
134
|
+
@pattern = /^application\/msword/
|
135
|
+
@html_extractor = HtmlContentExtractor.new
|
136
|
+
end
|
137
|
+
|
138
|
+
def process(content)
|
139
|
+
result = {}
|
140
|
+
as_file(content) do |infile|
|
141
|
+
outfile = Tempfile.new('rdig')
|
142
|
+
outfile.close
|
143
|
+
%x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
|
144
|
+
File.open(outfile.path) do |html|
|
145
|
+
result = @html_extractor.process(html.read)
|
146
|
+
end
|
147
|
+
outfile.delete
|
148
|
+
end
|
149
|
+
return result || {}
|
150
|
+
end
|
151
|
+
|
41
152
|
end
|
42
153
|
|
43
154
|
# extracts title, content and links from html documents
|
44
|
-
class HtmlContentExtractor
|
155
|
+
class HtmlContentExtractor < ContentExtractor
|
156
|
+
|
157
|
+
def initialize
|
158
|
+
@pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
|
159
|
+
end
|
45
160
|
|
46
161
|
# returns:
|
47
162
|
# { :content => 'extracted clear text',
|
48
163
|
# :meta => { :title => 'Title' },
|
49
164
|
# :links => [array of urls] }
|
50
|
-
def
|
165
|
+
def process(content)
|
51
166
|
result = { }
|
52
167
|
tag_soup = BeautifulSoup.new(content)
|
53
168
|
result[:title] = extract_title(tag_soup)
|
@@ -64,7 +179,7 @@ module RDig
|
|
64
179
|
# - Then, this element is processed by +extract_text+, which will give
|
65
180
|
# all textual content contained in the root element and all its
|
66
181
|
# children.
|
67
|
-
def
|
182
|
+
def extract_content(tag_soup)
|
68
183
|
content = ''
|
69
184
|
content_element(tag_soup).children { |child|
|
70
185
|
extract_text(child, content)
|
@@ -74,14 +189,14 @@ module RDig
|
|
74
189
|
|
75
190
|
# extracts the href attributes of all a tags, except
|
76
191
|
# internal links like <a href="#top">
|
77
|
-
def
|
192
|
+
def extract_links(tagsoup)
|
78
193
|
tagsoup.find_all('a').map { |link|
|
79
194
|
CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
|
80
195
|
}.compact
|
81
196
|
end
|
82
197
|
|
83
198
|
# Extracts the title from the given html tree
|
84
|
-
def
|
199
|
+
def extract_title(tagsoup)
|
85
200
|
title = ''
|
86
201
|
the_title_tag = title_tag(tagsoup)
|
87
202
|
if the_title_tag.is_a? String
|
@@ -93,7 +208,7 @@ module RDig
|
|
93
208
|
|
94
209
|
# Recursively extracts all text contained in the given element,
|
95
210
|
# and appends it to content.
|
96
|
-
def
|
211
|
+
def extract_text(element, content='')
|
97
212
|
if element.is_a? NavigableString
|
98
213
|
value = strip_comments(element)
|
99
214
|
value.strip!
|
@@ -118,7 +233,7 @@ module RDig
|
|
118
233
|
#
|
119
234
|
# This may return a string, e.g. an attribute value selected from a meta
|
120
235
|
# tag, too.
|
121
|
-
def
|
236
|
+
def title_tag(tagsoup)
|
122
237
|
if RDig.config.content_extraction.html.title_tag_selector
|
123
238
|
RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
|
124
239
|
else
|
@@ -127,7 +242,7 @@ module RDig
|
|
127
242
|
end
|
128
243
|
|
129
244
|
# Retrieve the root element to extract document content from
|
130
|
-
def
|
245
|
+
def content_element(tagsoup)
|
131
246
|
if RDig.config.content_extraction.html.content_tag_selector
|
132
247
|
RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
|
133
248
|
else
|
@@ -136,7 +251,7 @@ module RDig
|
|
136
251
|
end
|
137
252
|
|
138
253
|
# Return the given string minus all html comments
|
139
|
-
def
|
254
|
+
def strip_comments(string)
|
140
255
|
string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
|
141
256
|
end
|
142
257
|
end
|
data/rakefile
CHANGED
@@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t|
|
|
94
94
|
# Generate the RDoc documentation ----------------------------------------
|
95
95
|
|
96
96
|
rd = Rake::RDocTask.new { |rdoc|
|
97
|
-
rdoc.rdoc_dir = '
|
97
|
+
rdoc.rdoc_dir = 'html'
|
98
98
|
rdoc.title = "RDig - Ferret based full text search for web sites"
|
99
99
|
rdoc.options << '--line-numbers' << '--inline-source'
|
100
100
|
rdoc.options << '--main' << 'README'
|
@@ -323,3 +323,8 @@ task :tag => [:prerelease] do
|
|
323
323
|
end
|
324
324
|
end
|
325
325
|
|
326
|
+
# Publish RDocs ------------------------------------------------------
|
327
|
+
desc "Publish the API documentation"
|
328
|
+
task :pdoc => [:rdoc] do
|
329
|
+
Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
|
330
|
+
end
|
Binary file
|
Binary file
|
data/test/test_helper.rb
CHANGED
@@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
3
3
|
include TestHelper
|
4
4
|
|
5
5
|
def setup
|
6
|
-
@extractor = ContentExtractors::HtmlContentExtractor
|
6
|
+
@extractor = ContentExtractors::HtmlContentExtractor.new
|
7
7
|
@nbsp = [160].pack('U') # non breaking space
|
8
8
|
@config_backup = RDig.config.content_extraction.html.clone
|
9
9
|
end
|
@@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
12
12
|
RDig.config.content_extraction.html = @config_backup
|
13
13
|
end
|
14
14
|
|
15
|
+
def test_can_do
|
16
|
+
assert !@extractor.can_do('application/pdf')
|
17
|
+
assert !@extractor.can_do('application/msword')
|
18
|
+
assert @extractor.can_do('text/html')
|
19
|
+
assert @extractor.can_do('text/xml')
|
20
|
+
assert @extractor.can_do('application/xml')
|
21
|
+
assert @extractor.can_do('application/xhtml+xml')
|
22
|
+
end
|
23
|
+
|
15
24
|
def test_simple
|
16
|
-
result =
|
25
|
+
result = ContentExtractors.process(html_doc('simple'), 'text/html')
|
17
26
|
assert_not_nil result
|
18
27
|
assert_equal 'Sample Title', result[:title]
|
19
28
|
assert_not_nil result[:content]
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
class PdfContentExtractorTest < Test::Unit::TestCase
|
3
|
+
include TestHelper
|
4
|
+
|
5
|
+
def setup
|
6
|
+
@ce = ContentExtractors::PdfContentExtractor.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_can_do
|
10
|
+
assert @ce.can_do('application/pdf')
|
11
|
+
assert !@ce.can_do('application/msword')
|
12
|
+
end
|
13
|
+
def test_simple_with_ctype
|
14
|
+
result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
|
15
|
+
check_content(result)
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_simple
|
19
|
+
result = @ce.process(pdf_doc('simple'))
|
20
|
+
check_content(result)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
def check_content(result)
|
25
|
+
assert_not_nil result
|
26
|
+
assert_nil result[:title]
|
27
|
+
assert_nil result[:links]
|
28
|
+
assert_not_nil result[:content]
|
29
|
+
assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
class WordContentExtractorTest < Test::Unit::TestCase
|
3
|
+
include TestHelper
|
4
|
+
|
5
|
+
def setup
|
6
|
+
@ce = ContentExtractors::WordContentExtractor.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_can_do
|
10
|
+
assert !@ce.can_do('application/pdf')
|
11
|
+
assert @ce.can_do('application/msword')
|
12
|
+
end
|
13
|
+
def test_simple_with_ctype
|
14
|
+
result = ContentExtractors.process(word_doc('simple'), 'application/msword')
|
15
|
+
check_content(result)
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_simple
|
19
|
+
result = @ce.process(word_doc('simple'))
|
20
|
+
check_content(result)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
def check_content(result)
|
25
|
+
assert_not_nil result
|
26
|
+
assert_equal [], result[:links]
|
27
|
+
assert_not_nil result[:title]
|
28
|
+
assert_equal 'Untitled', result[:title]
|
29
|
+
assert_not_nil result[:content]
|
30
|
+
assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
metadata
CHANGED
@@ -1,102 +1,107 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.8.
|
2
|
+
rubygems_version: 0.8.11
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.
|
7
|
-
date: 2006-
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2006-04-19 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
|
-
|
10
|
+
- lib
|
11
11
|
email: jk@jkraemer.net
|
12
12
|
homepage: http://rdig.rubyforge.org/
|
13
13
|
rubyforge_project: rdig
|
14
|
-
description:
|
15
|
-
a site search for web sites or intranets. Internally, Ferret is used for the
|
16
|
-
full text indexing. After creating a config file for your site, the index can
|
17
|
-
be built with a single call to rdig."
|
14
|
+
description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig.
|
18
15
|
autorequire:
|
19
16
|
default_executable: rdig
|
20
17
|
bindir: bin
|
21
18
|
has_rdoc: true
|
22
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
20
|
requirements:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
version: 0.0.0
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
28
24
|
version:
|
29
25
|
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
30
28
|
authors:
|
31
|
-
|
29
|
+
- Jens Kraemer
|
32
30
|
files:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
31
|
+
- bin/rdig
|
32
|
+
- lib/rdig
|
33
|
+
- lib/htmlentities
|
34
|
+
- lib/rdig.rb
|
35
|
+
- lib/rdig/http_client.rb
|
36
|
+
- lib/rdig/crawler.rb
|
37
|
+
- lib/rdig/search.rb
|
38
|
+
- lib/rdig/highlight.rb
|
39
|
+
- lib/rdig/index.rb
|
40
|
+
- lib/rdig/url_filters.rb
|
41
|
+
- lib/rdig/content_extractors.rb
|
42
|
+
- lib/htmlentities/CHANGES
|
43
|
+
- lib/htmlentities/COPYING
|
44
|
+
- lib/htmlentities/README
|
45
|
+
- lib/htmlentities/htmlentities.rb
|
46
|
+
- test/unit
|
47
|
+
- test/fixtures
|
48
|
+
- test/test_helper.rb
|
49
|
+
- test/unit/etag_filter_test.rb
|
50
|
+
- test/unit/url_filters_test.rb
|
51
|
+
- test/unit/html_content_extractor_test.rb
|
52
|
+
- test/unit/pdf_content_extractor_test.rb
|
53
|
+
- test/unit/word_content_extractor_test.rb
|
54
|
+
- test/fixtures/html
|
55
|
+
- test/fixtures/pdf
|
56
|
+
- test/fixtures/word
|
57
|
+
- test/fixtures/html/entities.html
|
58
|
+
- test/fixtures/html/simple.html
|
59
|
+
- test/fixtures/html/custom_tag_selectors.html
|
60
|
+
- test/fixtures/pdf/simple.pdf
|
61
|
+
- test/fixtures/word/simple.doc
|
62
|
+
- doc/examples
|
63
|
+
- doc/examples/config.rb
|
64
|
+
- LICENSE
|
65
|
+
- TODO
|
66
|
+
- CHANGES
|
67
|
+
- README
|
68
|
+
- install.rb
|
69
|
+
- rakefile
|
66
70
|
test_files: []
|
71
|
+
|
67
72
|
rdoc_options:
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
+
- --title
|
74
|
+
- Rake -- Ruby Make
|
75
|
+
- --main
|
76
|
+
- README
|
77
|
+
- --line-numbers
|
73
78
|
extra_rdoc_files:
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
79
|
+
- README
|
80
|
+
- CHANGES
|
81
|
+
- LICENSE
|
82
|
+
- TODO
|
78
83
|
executables:
|
79
|
-
|
84
|
+
- rdig
|
80
85
|
extensions: []
|
86
|
+
|
81
87
|
requirements: []
|
88
|
+
|
82
89
|
dependencies:
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
version: 1.0.4
|
102
|
-
version:
|
90
|
+
- !ruby/object:Gem::Dependency
|
91
|
+
name: ferret
|
92
|
+
version_requirement:
|
93
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: 0.3.2
|
98
|
+
version:
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: rubyful_soup
|
101
|
+
version_requirement:
|
102
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 1.0.4
|
107
|
+
version:
|