rdig 0.3.5 → 0.3.8
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +6 -1
- data/History.txt +11 -0
- data/Manifest.txt +39 -0
- data/doc/examples/config.rb +10 -0
- data/lib/rdig.rb +12 -8
- data/lib/rdig/content_extractors.rb +7 -1
- data/lib/rdig/content_extractors/doc.rb +5 -19
- data/lib/rdig/content_extractors/hpricot.rb +13 -9
- data/lib/rdig/crawler.rb +18 -10
- data/lib/rdig/documents.rb +13 -9
- data/lib/rdig/url_filters.rb +15 -8
- data/rakefile +2 -1
- data/test/fixtures/html/frameset.html +13 -0
- data/test/fixtures/html/imagemap.html +13 -0
- data/test/unit/hpricot_content_extractor_test.rb +12 -0
- data/test/unit/searcher_test.rb +2 -2
- data/test/unit/url_filters_test.rb +1 -1
- metadata +108 -62
- data/TODO +0 -0
- data/lib/htmlentities/CHANGES +0 -21
- data/lib/htmlentities/COPYING +0 -7
- data/lib/htmlentities/README +0 -15
- data/lib/htmlentities/htmlentities.rb +0 -281
data/CHANGES
CHANGED
@@ -1,8 +1,13 @@
|
|
1
|
+
0.3.6
|
2
|
+
- remove bundled htmlentities in favor of a gem dependency
|
3
|
+
- also extract links from area and frame tags
|
4
|
+
- fix etagfilter bug
|
5
|
+
|
1
6
|
0.3.5
|
2
7
|
- Add max_depth option to crawler configuration for limiting the crawl to a
|
3
8
|
specific depth
|
4
9
|
- add support for http proxies including basic authentication
|
5
|
-
- remove
|
10
|
+
- remove rubyful_soup support
|
6
11
|
|
7
12
|
0.3.4
|
8
13
|
|
data/History.txt
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
== 0.3.8 2009-04-26
|
2
|
+
|
3
|
+
* bump up version
|
4
|
+
|
5
|
+
== 0.3.7 2009-04-26
|
6
|
+
|
7
|
+
* Gem spec for automatic gem building on github
|
8
|
+
* doc enhancements
|
9
|
+
* better uri-normalization, re-add result uri of redirection
|
10
|
+
into the queue instead of directly indexing the resulting
|
11
|
+
page
|
data/Manifest.txt
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
CHANGES
|
2
|
+
History.txt
|
3
|
+
install.rb
|
4
|
+
LICENSE
|
5
|
+
Manifest.txt
|
6
|
+
rakefile
|
7
|
+
README
|
8
|
+
bin/rdig
|
9
|
+
doc/examples/config.rb
|
10
|
+
lib/rdig/content_extractors/doc.rb
|
11
|
+
lib/rdig/content_extractors/hpricot.rb
|
12
|
+
lib/rdig/content_extractors/pdf.rb
|
13
|
+
lib/rdig/content_extractors.rb
|
14
|
+
lib/rdig/crawler.rb
|
15
|
+
lib/rdig/documents.rb
|
16
|
+
lib/rdig/file.rb
|
17
|
+
lib/rdig/highlight.rb
|
18
|
+
lib/rdig/index.rb
|
19
|
+
lib/rdig/search.rb
|
20
|
+
lib/rdig/url_filters.rb
|
21
|
+
lib/rdig.rb
|
22
|
+
test/fixtures/html/custom_tag_selectors.html
|
23
|
+
test/fixtures/html/entities.html
|
24
|
+
test/fixtures/html/frameset.html
|
25
|
+
test/fixtures/html/imagemap.html
|
26
|
+
test/fixtures/html/simple.html
|
27
|
+
test/fixtures/pdf/simple.pdf
|
28
|
+
test/fixtures/word/simple.doc
|
29
|
+
test/test_helper.rb
|
30
|
+
test/unit/crawler_fs_test.rb
|
31
|
+
test/unit/etag_filter_test.rb
|
32
|
+
test/unit/file_document_test.rb
|
33
|
+
test/unit/hpricot_content_extractor_test.rb
|
34
|
+
test/unit/http_document_test.rb
|
35
|
+
test/unit/pdf_content_extractor_test.rb
|
36
|
+
test/unit/rdig_test.rb
|
37
|
+
test/unit/searcher_test.rb
|
38
|
+
test/unit/url_filters_test.rb
|
39
|
+
test/unit/word_content_extractor_test.rb
|
data/doc/examples/config.rb
CHANGED
@@ -86,6 +86,13 @@ RDig.configuration do |cfg|
|
|
86
86
|
# Set to 0 to only index the start_urls.
|
87
87
|
# cfg.crawler.max_depth = nil
|
88
88
|
|
89
|
+
# default index document to be appended to URIs ending with a trailing '/'
|
90
|
+
# cfg.crawler.normalize_uri.index_document = nil
|
91
|
+
# strip trailing '/' from URIs to avoid double indexing of pages referred to both with and without a trailing '/'.
|
92
|
+
# Ignored if index_document is set.
|
93
|
+
# Not necessary when the server issues proper etags, since the default etag filter will drop these duplicates.
|
94
|
+
# cfg.crawler.normalize_uri.remove_trailing_slash = nil
|
95
|
+
|
89
96
|
# http proxy configuration
|
90
97
|
# proxy url
|
91
98
|
# cfg.crawler.http_proxy = nil
|
@@ -94,6 +101,9 @@ RDig.configuration do |cfg|
|
|
94
101
|
# cfg.crawler.http_proxy_user = nil
|
95
102
|
# proxy password
|
96
103
|
# cfg.crawler.http_proxy_pass = nil
|
104
|
+
#
|
105
|
+
# to use basic auth without a proxy, use this syntax:
|
106
|
+
# cfg.crawler.open_uri_http_options = { :http_basic_authentication => [user, password] }
|
97
107
|
|
98
108
|
# indexer options
|
99
109
|
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.
|
27
|
+
RDIGVERSION = '0.3.8'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -49,7 +49,8 @@ rescue LoadError
|
|
49
49
|
require 'ferret'
|
50
50
|
end
|
51
51
|
|
52
|
-
|
52
|
+
|
53
|
+
#require 'htmlentities/htmlentities'
|
53
54
|
|
54
55
|
|
55
56
|
$KCODE = 'u'
|
@@ -60,17 +61,16 @@ module RDig
|
|
60
61
|
|
61
62
|
class << self
|
62
63
|
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
# a document has to survive all filters in the chain to get indexed.
|
64
|
+
# Filter chains are used by the crawler to limit the set of documents being indexed.
|
65
|
+
# There are two chains - one for http, and one for file system crawling.
|
66
|
+
# Each document has to survive all filters in the relevant chain to get indexed.
|
67
67
|
def filter_chain
|
68
68
|
@filter_chain ||= {
|
69
69
|
# filter chain for http crawling
|
70
70
|
:http => [
|
71
71
|
:scheme_filter_http,
|
72
72
|
:fix_relative_uri,
|
73
|
-
:normalize_uri,
|
73
|
+
{ :normalize_uri => :normalize_uri },
|
74
74
|
{ RDig::UrlFilters::DepthFilter => :max_depth },
|
75
75
|
{ :hostname_filter => :include_hosts },
|
76
76
|
{ RDig::UrlFilters::UrlInclusionFilter => :include_documents },
|
@@ -120,7 +120,11 @@ module RDig
|
|
120
120
|
:wait_before_leave => 10,
|
121
121
|
:http_proxy => nil,
|
122
122
|
:http_proxy_user => nil,
|
123
|
-
:http_proxy_pass => nil
|
123
|
+
:http_proxy_pass => nil,
|
124
|
+
:normalize_uri => OpenStruct.new(
|
125
|
+
:index_document => nil,
|
126
|
+
:remove_trailing_slash => nil
|
127
|
+
)
|
124
128
|
),
|
125
129
|
:content_extraction => OpenStruct.new(
|
126
130
|
# settings for html content extraction (hpricot)
|
@@ -23,7 +23,13 @@ module RDig
|
|
23
23
|
def self.extractor_instances
|
24
24
|
@@extractor_instances ||= extractors.map { |ex_class|
|
25
25
|
RDig.logger.info "initializing content extractor: #{ex_class}"
|
26
|
-
|
26
|
+
ex = nil
|
27
|
+
begin
|
28
|
+
ex = ex_class.new(RDig.configuration.content_extraction)
|
29
|
+
rescue Exception
|
30
|
+
RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
|
31
|
+
end
|
32
|
+
ex
|
27
33
|
}.compact
|
28
34
|
end
|
29
35
|
|
@@ -13,25 +13,11 @@ module RDig
|
|
13
13
|
@wvhtml = 'wvHtml'
|
14
14
|
@pattern = /^application\/msword/
|
15
15
|
# html extractor for parsing wvHtml output
|
16
|
-
|
17
|
-
|
18
|
-
:
|
19
|
-
|
20
|
-
|
21
|
-
)))
|
22
|
-
elsif defined?(RubyfulSoupContentExtractor)
|
23
|
-
@html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
|
24
|
-
:rubyful_soup => OpenStruct.new(
|
25
|
-
:content_tag_selector => lambda { |tagsoup|
|
26
|
-
tagsoup.html.body
|
27
|
-
},
|
28
|
-
:title_tag_selector => lambda { |tagsoup|
|
29
|
-
tagsoup.html.head.title
|
30
|
-
}
|
31
|
-
)))
|
32
|
-
else
|
33
|
-
raise "need at least one html content extractor - please install hpricot or rubyful_soup"
|
34
|
-
end
|
16
|
+
@html_extractor = HpricotContentExtractor.new(OpenStruct.new(
|
17
|
+
:hpricot => OpenStruct.new(
|
18
|
+
:content_tag_selector => 'body',
|
19
|
+
:title_tag_selector => 'title'
|
20
|
+
)))
|
35
21
|
# TODO: better: if $?.exitstatus == 127 (not found)
|
36
22
|
@available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
|
37
23
|
end
|
@@ -1,11 +1,12 @@
|
|
1
1
|
begin
|
2
2
|
require 'hpricot'
|
3
|
+
require 'htmlentities'
|
3
4
|
rescue LoadError
|
4
5
|
require 'rubygems'
|
5
6
|
require 'hpricot'
|
7
|
+
require 'htmlentities'
|
6
8
|
end
|
7
9
|
|
8
|
-
if defined?(Hpricot)
|
9
10
|
module RDig
|
10
11
|
module ContentExtractors
|
11
12
|
|
@@ -23,11 +24,12 @@ module RDig
|
|
23
24
|
# :title => 'Title',
|
24
25
|
# :links => [array of urls] }
|
25
26
|
def process(content)
|
27
|
+
entities = HTMLEntities.new
|
26
28
|
doc = Hpricot(content)
|
27
29
|
{
|
28
|
-
:title => extract_title(doc).
|
30
|
+
:title => entities.decode(extract_title(doc)).strip,
|
29
31
|
:links => extract_links(doc),
|
30
|
-
:content => extract_content(doc)
|
32
|
+
:content => entities.decode(extract_content(doc))
|
31
33
|
}
|
32
34
|
end
|
33
35
|
|
@@ -50,12 +52,14 @@ module RDig
|
|
50
52
|
# extracts the href attributes of all a tags, except
|
51
53
|
# internal links like <a href="#top">
|
52
54
|
def extract_links(doc)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
55
|
+
{'a' => 'href', 'area' => 'href', 'frame' => 'src'}.map do |tag, attr|
|
56
|
+
(doc/tag).map do |tag|
|
57
|
+
value = tag[attr]
|
58
|
+
CGI.unescapeHTML(value) if value && value !~ /^#/
|
59
|
+
end
|
60
|
+
end.flatten.compact
|
57
61
|
end
|
58
|
-
|
62
|
+
|
59
63
|
# Extracts the title from the given html tree
|
60
64
|
def extract_title(doc)
|
61
65
|
the_title_tag = title_tag(doc)
|
@@ -85,6 +89,7 @@ module RDig
|
|
85
89
|
def strip_comments(string)
|
86
90
|
string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
|
87
91
|
end
|
92
|
+
|
88
93
|
def strip_tags(string)
|
89
94
|
string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
|
90
95
|
Regexp::MULTILINE, 'u'), ''
|
@@ -98,4 +103,3 @@ module RDig
|
|
98
103
|
|
99
104
|
end
|
100
105
|
end
|
101
|
-
end
|
data/lib/rdig/crawler.rb
CHANGED
@@ -5,7 +5,6 @@ module RDig
|
|
5
5
|
|
6
6
|
def initialize(config = RDig.config, logger = RDig.logger)
|
7
7
|
@documents = Queue.new
|
8
|
-
@etag_filter = ETagFilter.new
|
9
8
|
@logger = logger
|
10
9
|
@config = config
|
11
10
|
end
|
@@ -22,7 +21,8 @@ module RDig
|
|
22
21
|
# check whether we are indexing on-disk or via http
|
23
22
|
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
|
24
23
|
chain_config = RDig.filter_chain[url_type]
|
25
|
-
|
24
|
+
|
25
|
+
@etag_filter = ETagFilter.new
|
26
26
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
27
27
|
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
|
28
28
|
|
@@ -52,13 +52,21 @@ module RDig
|
|
52
52
|
def process_document(doc, filterchain)
|
53
53
|
@logger.debug "processing document #{doc}"
|
54
54
|
doc.fetch
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
55
|
+
case doc.status
|
56
|
+
when :success
|
57
|
+
if @etag_filter.apply(doc)
|
58
|
+
# add links from this document to the queue
|
59
|
+
doc.content[:links].each { |url|
|
60
|
+
add_url(url, filterchain, doc)
|
61
|
+
} unless doc.content[:links].nil?
|
62
|
+
add_to_index doc
|
63
|
+
end
|
64
|
+
when :redirect
|
65
|
+
@logger.debug "redirect to #{doc.content}"
|
66
|
+
add_url(doc.content, filterchain, doc)
|
67
|
+
else
|
68
|
+
@logger.error "unknown doc status #{doc.status}: #{doc}"
|
69
|
+
end
|
62
70
|
rescue
|
63
71
|
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
|
64
72
|
@logger.debug "Trace: #{$!.backtrace.join("\n")}"
|
@@ -110,7 +118,7 @@ module RDig
|
|
110
118
|
end
|
111
119
|
|
112
120
|
def apply(document)
|
113
|
-
return document unless (document.respond_to?(:etag) && document.etag)
|
121
|
+
return document unless (document.respond_to?(:etag) && document.etag && !document.etag.empty?)
|
114
122
|
synchronize do
|
115
123
|
@etags.add?(document.etag) ? document : nil
|
116
124
|
end
|
data/lib/rdig/documents.rb
CHANGED
@@ -118,16 +118,20 @@ module RDig
|
|
118
118
|
def fetch
|
119
119
|
RDig.logger.debug "fetching #{@uri.to_s}"
|
120
120
|
open(@uri.to_s, RDig::open_uri_http_options) do |doc|
|
121
|
-
|
122
|
-
|
123
|
-
@
|
124
|
-
# puts "etag: #{@etag}"
|
125
|
-
@content = ContentExtractors.process(doc.read, doc.content_type)
|
126
|
-
@status = :success
|
127
|
-
when 404
|
128
|
-
RDig.logger.info "got 404 for #{@uri}"
|
121
|
+
if @uri.to_s != doc.base_uri.to_s
|
122
|
+
@status = :redirect
|
123
|
+
@content = doc.base_uri
|
129
124
|
else
|
130
|
-
|
125
|
+
case doc.status.first.to_i
|
126
|
+
when 200
|
127
|
+
@etag = doc.meta['etag']
|
128
|
+
@content = ContentExtractors.process(doc.read, doc.content_type)
|
129
|
+
@status = :success
|
130
|
+
when 404
|
131
|
+
RDig.logger.info "got 404 for #{@uri}"
|
132
|
+
else
|
133
|
+
RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
|
134
|
+
end
|
131
135
|
end
|
132
136
|
end
|
133
137
|
rescue
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -22,7 +22,7 @@ module RDig
|
|
22
22
|
end
|
23
23
|
|
24
24
|
# add a filter and its args to the chain
|
25
|
-
#
|
25
|
+
# if args is a symbol, it is treated as a configuration key
|
26
26
|
def add(filter, args=nil)
|
27
27
|
args = RDig.config.crawler.send(args) if args.is_a? Symbol
|
28
28
|
case filter
|
@@ -163,7 +163,7 @@ module RDig
|
|
163
163
|
return document
|
164
164
|
end
|
165
165
|
|
166
|
-
# expands
|
166
|
+
# expands href="/path/xyz.html", href="affe.html" and href="../lala.html"
|
167
167
|
# to full urls
|
168
168
|
def UrlFilters.fix_relative_uri(document)
|
169
169
|
#return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
|
@@ -175,11 +175,13 @@ module RDig
|
|
175
175
|
uri.port = ref.port unless uri.port || ref.port==ref.default_port
|
176
176
|
uri.path = ref.path unless uri.path
|
177
177
|
|
178
|
-
|
178
|
+
old_uri_path = uri.path
|
179
|
+
if uri.path !~ /^\// || uri.path =~ /^\.\./
|
179
180
|
ref_path = ref.path || '/'
|
180
181
|
ref_path << '/' if ref_path.empty?
|
181
182
|
uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
|
182
|
-
end
|
183
|
+
end
|
184
|
+
uri.path = uri.path.sub( /\/[^\/]*\/\.\./, "" ) if old_uri_path =~ /^\.\./
|
183
185
|
return document
|
184
186
|
rescue
|
185
187
|
p document
|
@@ -193,12 +195,17 @@ module RDig
|
|
193
195
|
return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
|
194
196
|
end
|
195
197
|
|
196
|
-
def UrlFilters.normalize_uri(document)
|
198
|
+
def UrlFilters.normalize_uri(document, cfg)
|
197
199
|
document.uri.fragment = nil
|
198
200
|
# document.uri.query = nil
|
199
|
-
#
|
200
|
-
if
|
201
|
-
document
|
201
|
+
# trailing slash handling
|
202
|
+
if document.uri.path =~ /\/$/
|
203
|
+
# append index document if configured
|
204
|
+
if cfg.index_document
|
205
|
+
document.uri.path << RDig.config.index_document
|
206
|
+
elsif cfg.remove_trailing_slash
|
207
|
+
document.uri.path.gsub! /\/$/, ''
|
208
|
+
end
|
202
209
|
end
|
203
210
|
return document
|
204
211
|
end
|
data/rakefile
CHANGED
@@ -132,6 +132,7 @@ else
|
|
132
132
|
|
133
133
|
s.add_dependency('ferret', '>= 0.10.0')
|
134
134
|
s.add_dependency('hpricot', '>= 0.6')
|
135
|
+
s.add_dependency('htmlentities', '>= 4.0.0')
|
135
136
|
#s.requirements << ""
|
136
137
|
|
137
138
|
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
@@ -321,7 +322,7 @@ task :tag => [:prerelease] do
|
|
321
322
|
if ENV['RELTEST']
|
322
323
|
announce "Release Task Testing, skipping tagging"
|
323
324
|
else
|
324
|
-
sh %{
|
325
|
+
sh %{svn copy svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/trunk svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/tags/#{reltag}}
|
325
326
|
end
|
326
327
|
end
|
327
328
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<frameset>
|
9
|
+
<frame src="http://test.host/first.html" />
|
10
|
+
<frame src="/second.html" />
|
11
|
+
</frameset>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<map>
|
9
|
+
<area href="http://test.host/first.html" />
|
10
|
+
<area href="/second.html" />
|
11
|
+
</map>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -52,6 +52,18 @@ class HpricotContentExtractorTest < Test::Unit::TestCase
|
|
52
52
|
assert_equal '/inside.html', result[:links][1]
|
53
53
|
assert_equal '/footer.html', result[:links][2]
|
54
54
|
end
|
55
|
+
|
56
|
+
def test_extracts_links_from_frameset
|
57
|
+
result = @extractor.process(html_doc('frameset'))
|
58
|
+
assert_equal 'http://test.host/first.html', result[:links].first
|
59
|
+
assert_equal '/second.html', result[:links].last
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_extracts_links_from_imagemap
|
63
|
+
result = @extractor.process(html_doc('imagemap'))
|
64
|
+
assert_equal 'http://test.host/first.html', result[:links].first
|
65
|
+
assert_equal '/second.html', result[:links].last
|
66
|
+
end
|
55
67
|
|
56
68
|
|
57
69
|
def test_title_from_dcmeta
|
data/test/unit/searcher_test.rb
CHANGED
@@ -28,8 +28,8 @@ class SearcherTest < Test::Unit::TestCase
|
|
28
28
|
|
29
29
|
def test_search
|
30
30
|
result = RDig.searcher.search 'some sample text'
|
31
|
-
assert_equal
|
32
|
-
assert_equal
|
31
|
+
assert_equal 5, result[:hitcount]
|
32
|
+
assert_equal 5, result[:list].size
|
33
33
|
end
|
34
34
|
|
35
35
|
end
|
@@ -74,7 +74,7 @@ class UrlFiltersTest < Test::Unit::TestCase
|
|
74
74
|
doc = Document.create('http://test.host/dir/file.html')
|
75
75
|
assert_equal('http://test.host/dir/another.html',
|
76
76
|
UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
|
77
|
-
assert_equal('http://test.host/
|
77
|
+
assert_equal('http://test.host/another.html',
|
78
78
|
UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
|
79
79
|
assert_equal('http://test.host/dir/another.html',
|
80
80
|
UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdig
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 3
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 8
|
10
|
+
version: 0.3.8
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Jens Kraemer
|
@@ -9,117 +15,157 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2009-04-26 00:00:00 +02:00
|
19
|
+
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: ferret
|
17
|
-
|
18
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
19
26
|
requirements:
|
20
27
|
- - ">="
|
21
28
|
- !ruby/object:Gem::Version
|
22
|
-
|
23
|
-
|
29
|
+
hash: 63
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 11
|
33
|
+
- 6
|
34
|
+
version: 0.11.6
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
24
37
|
- !ruby/object:Gem::Dependency
|
25
38
|
name: hpricot
|
26
|
-
|
27
|
-
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
28
42
|
requirements:
|
29
43
|
- - ">="
|
30
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
- 6
|
31
49
|
version: "0.6"
|
32
|
-
|
33
|
-
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: htmlentities
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 63
|
61
|
+
segments:
|
62
|
+
- 4
|
63
|
+
- 0
|
64
|
+
- 0
|
65
|
+
version: 4.0.0
|
66
|
+
type: :runtime
|
67
|
+
version_requirements: *id003
|
68
|
+
description: Website crawler and fulltext indexer.
|
34
69
|
email: jk@jkraemer.net
|
35
|
-
executables:
|
36
|
-
|
70
|
+
executables: []
|
71
|
+
|
37
72
|
extensions: []
|
38
73
|
|
39
74
|
extra_rdoc_files:
|
75
|
+
- History.txt
|
76
|
+
- Manifest.txt
|
40
77
|
- README
|
78
|
+
files:
|
41
79
|
- CHANGES
|
80
|
+
- History.txt
|
81
|
+
- install.rb
|
42
82
|
- LICENSE
|
43
|
-
-
|
44
|
-
|
83
|
+
- Manifest.txt
|
84
|
+
- rakefile
|
85
|
+
- README
|
45
86
|
- bin/rdig
|
46
|
-
-
|
47
|
-
- lib/rdig
|
48
|
-
- lib/rdig/
|
49
|
-
- lib/rdig/
|
50
|
-
- lib/rdig/crawler.rb
|
87
|
+
- doc/examples/config.rb
|
88
|
+
- lib/rdig/content_extractors/doc.rb
|
89
|
+
- lib/rdig/content_extractors/hpricot.rb
|
90
|
+
- lib/rdig/content_extractors/pdf.rb
|
51
91
|
- lib/rdig/content_extractors.rb
|
92
|
+
- lib/rdig/crawler.rb
|
93
|
+
- lib/rdig/documents.rb
|
52
94
|
- lib/rdig/file.rb
|
53
95
|
- lib/rdig/highlight.rb
|
54
|
-
- lib/rdig/
|
96
|
+
- lib/rdig/index.rb
|
55
97
|
- lib/rdig/search.rb
|
56
|
-
- lib/rdig/
|
57
|
-
- lib/rdig
|
58
|
-
- lib/rdig/content_extractors/hpricot.rb
|
59
|
-
- lib/rdig/content_extractors/pdf.rb
|
60
|
-
- lib/htmlentities
|
61
|
-
- lib/htmlentities/htmlentities.rb
|
62
|
-
- lib/htmlentities/COPYING
|
63
|
-
- lib/htmlentities/CHANGES
|
64
|
-
- lib/htmlentities/README
|
65
|
-
- test/fixtures
|
66
|
-
- test/fixtures/word
|
67
|
-
- test/fixtures/word/simple.doc
|
68
|
-
- test/fixtures/html
|
98
|
+
- lib/rdig/url_filters.rb
|
99
|
+
- lib/rdig.rb
|
69
100
|
- test/fixtures/html/custom_tag_selectors.html
|
70
|
-
- test/fixtures/html/simple.html
|
71
101
|
- test/fixtures/html/entities.html
|
72
|
-
- test/fixtures/
|
102
|
+
- test/fixtures/html/frameset.html
|
103
|
+
- test/fixtures/html/imagemap.html
|
104
|
+
- test/fixtures/html/simple.html
|
73
105
|
- test/fixtures/pdf/simple.pdf
|
74
|
-
- test/
|
106
|
+
- test/fixtures/word/simple.doc
|
107
|
+
- test/test_helper.rb
|
75
108
|
- test/unit/crawler_fs_test.rb
|
109
|
+
- test/unit/etag_filter_test.rb
|
110
|
+
- test/unit/file_document_test.rb
|
111
|
+
- test/unit/hpricot_content_extractor_test.rb
|
112
|
+
- test/unit/http_document_test.rb
|
76
113
|
- test/unit/pdf_content_extractor_test.rb
|
77
|
-
- test/unit/word_content_extractor_test.rb
|
78
114
|
- test/unit/rdig_test.rb
|
79
|
-
- test/unit/http_document_test.rb
|
80
115
|
- test/unit/searcher_test.rb
|
81
|
-
- test/unit/file_document_test.rb
|
82
116
|
- test/unit/url_filters_test.rb
|
83
|
-
- test/unit/
|
84
|
-
- test/unit/etag_filter_test.rb
|
85
|
-
- test/test_helper.rb
|
86
|
-
- doc/examples
|
87
|
-
- doc/examples/config.rb
|
88
|
-
- LICENSE
|
89
|
-
- TODO
|
90
|
-
- CHANGES
|
91
|
-
- README
|
92
|
-
- install.rb
|
93
|
-
- rakefile
|
117
|
+
- test/unit/word_content_extractor_test.rb
|
94
118
|
has_rdoc: true
|
95
|
-
homepage: http://
|
119
|
+
homepage: http://github.com/jkraemer/rdig/
|
120
|
+
licenses: []
|
121
|
+
|
96
122
|
post_install_message:
|
97
123
|
rdoc_options:
|
98
|
-
- --title
|
99
|
-
- Rake -- Ruby Make
|
100
124
|
- --main
|
101
125
|
- README
|
102
|
-
- --line-numbers
|
103
126
|
require_paths:
|
104
127
|
- lib
|
105
128
|
required_ruby_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
106
130
|
requirements:
|
107
131
|
- - ">="
|
108
132
|
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
134
|
+
segments:
|
135
|
+
- 0
|
109
136
|
version: "0"
|
110
|
-
version:
|
111
137
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
112
139
|
requirements:
|
113
140
|
- - ">="
|
114
141
|
- !ruby/object:Gem::Version
|
142
|
+
hash: 3
|
143
|
+
segments:
|
144
|
+
- 0
|
115
145
|
version: "0"
|
116
|
-
version:
|
117
146
|
requirements: []
|
118
147
|
|
119
148
|
rubyforge_project: rdig
|
120
|
-
rubygems_version: 1.
|
149
|
+
rubygems_version: 1.5.3
|
121
150
|
signing_key:
|
122
|
-
specification_version:
|
123
|
-
summary:
|
124
|
-
test_files:
|
125
|
-
|
151
|
+
specification_version: 3
|
152
|
+
summary: Crawler and content extractor for building a full text index of a website's contents. Uses Ferret for indexing.
|
153
|
+
test_files:
|
154
|
+
- test/fixtures/html/custom_tag_selectors.html
|
155
|
+
- test/fixtures/html/entities.html
|
156
|
+
- test/fixtures/html/frameset.html
|
157
|
+
- test/fixtures/html/imagemap.html
|
158
|
+
- test/fixtures/html/simple.html
|
159
|
+
- test/fixtures/pdf/simple.pdf
|
160
|
+
- test/fixtures/word/simple.doc
|
161
|
+
- test/test_helper.rb
|
162
|
+
- test/unit/crawler_fs_test.rb
|
163
|
+
- test/unit/etag_filter_test.rb
|
164
|
+
- test/unit/file_document_test.rb
|
165
|
+
- test/unit/hpricot_content_extractor_test.rb
|
166
|
+
- test/unit/http_document_test.rb
|
167
|
+
- test/unit/pdf_content_extractor_test.rb
|
168
|
+
- test/unit/rdig_test.rb
|
169
|
+
- test/unit/searcher_test.rb
|
170
|
+
- test/unit/url_filters_test.rb
|
171
|
+
- test/unit/word_content_extractor_test.rb
|
data/TODO
DELETED
File without changes
|
data/lib/htmlentities/CHANGES
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
== 2.2 (2005-11-07)
|
2
|
-
* Important bug fixes -- thanks to Moonwolf
|
3
|
-
* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
|
4
|
-
* Decimal decoding edge cases addressed.
|
5
|
-
* Test cases added.
|
6
|
-
|
7
|
-
== 2.1 (2005-10-31)
|
8
|
-
* Removed some unnecessary code in basic entity encoding.
|
9
|
-
* Improved handling of encoding: commands are now automatically sorted, so the
|
10
|
-
user doesn't have to worry about their order.
|
11
|
-
* Now using setup.rb.
|
12
|
-
* Tests moved to separate file.
|
13
|
-
|
14
|
-
== 2.0 (2005-08-23)
|
15
|
-
* Added encoding to entities.
|
16
|
-
* Decoding interface unchanged.
|
17
|
-
* Fixed a bug with handling high codepoints.
|
18
|
-
|
19
|
-
== 1.0 (2005-08-03)
|
20
|
-
* Initial release.
|
21
|
-
* Decoding only.
|
data/lib/htmlentities/COPYING
DELETED
data/lib/htmlentities/README
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
HTML entity encoding and decoding for Ruby
|
2
|
-
|
3
|
-
This library extends the String class to allow encoding and decoding of
|
4
|
-
HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
5
|
-
|
6
|
-
To install (requires root/admin privileges):
|
7
|
-
|
8
|
-
# ruby setup.rb
|
9
|
-
|
10
|
-
To test:
|
11
|
-
|
12
|
-
$ ruby setup.rb test
|
13
|
-
|
14
|
-
Comments are welcome. Send an email to pbattley @ gmail.com.
|
15
|
-
|
@@ -1,281 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# HTML entity encoding and decoding for Ruby
|
3
|
-
#
|
4
|
-
# Author:: Paul BATTLEY (pbattley @ gmail.com)
|
5
|
-
# Version:: 2.2
|
6
|
-
# Date:: 2005-11-07
|
7
|
-
#
|
8
|
-
# == About
|
9
|
-
#
|
10
|
-
# This library extends the String class to allow encoding and decoding of
|
11
|
-
# HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
12
|
-
#
|
13
|
-
# == Licence
|
14
|
-
#
|
15
|
-
# Copyright (c) 2005 Paul Battley
|
16
|
-
#
|
17
|
-
# Usage of the works is permitted provided that this instrument is retained
|
18
|
-
# with the works, so that any entity that uses the works is notified of this
|
19
|
-
# instrument.
|
20
|
-
#
|
21
|
-
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
|
22
|
-
#
|
23
|
-
|
24
|
-
module HTMLEntities
|
25
|
-
|
26
|
-
VERSION = '2.2'
|
27
|
-
|
28
|
-
#
|
29
|
-
# MAP is a hash of all the HTML entities I could discover, as taken
|
30
|
-
# from the w3schools page on the subject:
|
31
|
-
# http://www.w3schools.com/html/html_entitiesref.asp
|
32
|
-
# The format is 'entity name' => codepoint where entity name is given
|
33
|
-
# without the surrounding ampersand and semicolon.
|
34
|
-
#
|
35
|
-
MAP = {
|
36
|
-
'quot' => 34,
|
37
|
-
'apos' => 39,
|
38
|
-
'amp' => 38,
|
39
|
-
'lt' => 60,
|
40
|
-
'gt' => 62,
|
41
|
-
'nbsp' => 160,
|
42
|
-
'iexcl' => 161,
|
43
|
-
'curren' => 164,
|
44
|
-
'cent' => 162,
|
45
|
-
'pound' => 163,
|
46
|
-
'yen' => 165,
|
47
|
-
'brvbar' => 166,
|
48
|
-
'sect' => 167,
|
49
|
-
'uml' => 168,
|
50
|
-
'copy' => 169,
|
51
|
-
'ordf' => 170,
|
52
|
-
'laquo' => 171,
|
53
|
-
'not' => 172,
|
54
|
-
'shy' => 173,
|
55
|
-
'reg' => 174,
|
56
|
-
'trade' => 8482,
|
57
|
-
'macr' => 175,
|
58
|
-
'deg' => 176,
|
59
|
-
'plusmn' => 177,
|
60
|
-
'sup2' => 178,
|
61
|
-
'sup3' => 179,
|
62
|
-
'acute' => 180,
|
63
|
-
'micro' => 181,
|
64
|
-
'para' => 182,
|
65
|
-
'middot' => 183,
|
66
|
-
'cedil' => 184,
|
67
|
-
'sup1' => 185,
|
68
|
-
'ordm' => 186,
|
69
|
-
'raquo' => 187,
|
70
|
-
'frac14' => 188,
|
71
|
-
'frac12' => 189,
|
72
|
-
'frac34' => 190,
|
73
|
-
'iquest' => 191,
|
74
|
-
'times' => 215,
|
75
|
-
'divide' => 247,
|
76
|
-
'Agrave' => 192,
|
77
|
-
'Aacute' => 193,
|
78
|
-
'Acirc' => 194,
|
79
|
-
'Atilde' => 195,
|
80
|
-
'Auml' => 196,
|
81
|
-
'Aring' => 197,
|
82
|
-
'AElig' => 198,
|
83
|
-
'Ccedil' => 199,
|
84
|
-
'Egrave' => 200,
|
85
|
-
'Eacute' => 201,
|
86
|
-
'Ecirc' => 202,
|
87
|
-
'Euml' => 203,
|
88
|
-
'Igrave' => 204,
|
89
|
-
'Iacute' => 205,
|
90
|
-
'Icirc' => 206,
|
91
|
-
'Iuml' => 207,
|
92
|
-
'ETH' => 208,
|
93
|
-
'Ntilde' => 209,
|
94
|
-
'Ograve' => 210,
|
95
|
-
'Oacute' => 211,
|
96
|
-
'Ocirc' => 212,
|
97
|
-
'Otilde' => 213,
|
98
|
-
'Ouml' => 214,
|
99
|
-
'Oslash' => 216,
|
100
|
-
'Ugrave' => 217,
|
101
|
-
'Uacute' => 218,
|
102
|
-
'Ucirc' => 219,
|
103
|
-
'Uuml' => 220,
|
104
|
-
'Yacute' => 221,
|
105
|
-
'THORN' => 222,
|
106
|
-
'szlig' => 223,
|
107
|
-
'agrave' => 224,
|
108
|
-
'aacute' => 225,
|
109
|
-
'acirc' => 226,
|
110
|
-
'atilde' => 227,
|
111
|
-
'auml' => 228,
|
112
|
-
'aring' => 229,
|
113
|
-
'aelig' => 230,
|
114
|
-
'ccedil' => 231,
|
115
|
-
'egrave' => 232,
|
116
|
-
'eacute' => 233,
|
117
|
-
'ecirc' => 234,
|
118
|
-
'euml' => 235,
|
119
|
-
'igrave' => 236,
|
120
|
-
'iacute' => 237,
|
121
|
-
'icirc' => 238,
|
122
|
-
'iuml' => 239,
|
123
|
-
'eth' => 240,
|
124
|
-
'ntilde' => 241,
|
125
|
-
'ograve' => 242,
|
126
|
-
'oacute' => 243,
|
127
|
-
'ocirc' => 244,
|
128
|
-
'otilde' => 245,
|
129
|
-
'ouml' => 246,
|
130
|
-
'oslash' => 248,
|
131
|
-
'ugrave' => 249,
|
132
|
-
'uacute' => 250,
|
133
|
-
'ucirc' => 251,
|
134
|
-
'uuml' => 252,
|
135
|
-
'yacute' => 253,
|
136
|
-
'thorn' => 254,
|
137
|
-
'yuml' => 255,
|
138
|
-
'OElig' => 338,
|
139
|
-
'oelig' => 339,
|
140
|
-
'Scaron' => 352,
|
141
|
-
'scaron' => 353,
|
142
|
-
'Yuml' => 376,
|
143
|
-
'circ' => 710,
|
144
|
-
'tilde' => 732,
|
145
|
-
'ensp' => 8194,
|
146
|
-
'emsp' => 8195,
|
147
|
-
'thinsp' => 8201,
|
148
|
-
'zwnj' => 8204,
|
149
|
-
'zwj' => 8205,
|
150
|
-
'lrm' => 8206,
|
151
|
-
'rlm' => 8207,
|
152
|
-
'ndash' => 8211,
|
153
|
-
'mdash' => 8212,
|
154
|
-
'lsquo' => 8216,
|
155
|
-
'rsquo' => 8217,
|
156
|
-
'sbquo' => 8218,
|
157
|
-
'ldquo' => 8220,
|
158
|
-
'rdquo' => 8221,
|
159
|
-
'bdquo' => 8222,
|
160
|
-
'dagger' => 8224,
|
161
|
-
'Dagger' => 8225,
|
162
|
-
'hellip' => 8230,
|
163
|
-
'permil' => 8240,
|
164
|
-
'lsaquo' => 8249,
|
165
|
-
'rsaquo' => 8250,
|
166
|
-
'euro' => 8364
|
167
|
-
}
|
168
|
-
|
169
|
-
MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
|
170
|
-
MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
|
171
|
-
|
172
|
-
# Precompile the regexp
|
173
|
-
NAMED_ENTITY_REGEXP =
|
174
|
-
/&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
|
175
|
-
|
176
|
-
# Reverse map for converting characters to named entities
|
177
|
-
REVERSE_MAP = MAP.invert
|
178
|
-
|
179
|
-
BASIC_ENTITY_REGEXP = /[<>'"&]/
|
180
|
-
|
181
|
-
UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
|
182
|
-
|
183
|
-
end
|
184
|
-
|
185
|
-
class String
|
186
|
-
|
187
|
-
# Because there's no need to make the user worry about the order here,
|
188
|
-
# let's handle it.
|
189
|
-
ENCODE_ENTITIES_COMMAND_ORDER = {
|
190
|
-
:basic => 0,
|
191
|
-
:named => 1,
|
192
|
-
:decimal => 2,
|
193
|
-
:hexadecimal => 3
|
194
|
-
}
|
195
|
-
|
196
|
-
#
|
197
|
-
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
198
|
-
# equivalents. Obviously, if your string is not already in UTF-8, you'd
|
199
|
-
# better convert it before using this method, or the output will be mixed
|
200
|
-
# up.
|
201
|
-
# Unknown named entities are not converted
|
202
|
-
#
|
203
|
-
def decode_entities
|
204
|
-
return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
|
205
|
-
HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
|
206
|
-
}.gsub(/&#([0-9]{1,7});/) {
|
207
|
-
[$1.to_i].pack('U')
|
208
|
-
}.gsub(/&#x([0-9a-f]{1,6});/i) {
|
209
|
-
[$1.to_i(16)].pack('U')
|
210
|
-
}
|
211
|
-
end
|
212
|
-
|
213
|
-
#
|
214
|
-
# Encode codepoints into their corresponding entities. Various operations
|
215
|
-
# are possible, and may be specified in order:
|
216
|
-
#
|
217
|
-
# :basic :: Convert the five XML entities ('"<>&)
|
218
|
-
# :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
|
219
|
-
# :decimal :: Convert non-ASCII characters to decimal entities (e.g. Ӓ)
|
220
|
-
# :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # ካ)
|
221
|
-
#
|
222
|
-
# You can specify the commands in any order, but they will be executed in
|
223
|
-
# the order listed above to ensure that entity ampersands are not
|
224
|
-
# clobbered and that named entities are replaced before numeric ones.
|
225
|
-
#
|
226
|
-
# If no instructions are specified, :basic will be used.
|
227
|
-
#
|
228
|
-
# Examples:
|
229
|
-
# str.encode_entities - XML-safe
|
230
|
-
# str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
|
231
|
-
# str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
|
232
|
-
# non-ASCII characters replaced with their named entity where possible, and
|
233
|
-
# decimal equivalents otherwise.
|
234
|
-
#
|
235
|
-
# Note: It is the program's responsibility to ensure that the string
|
236
|
-
# contains valid UTF-8 before calling this method.
|
237
|
-
#
|
238
|
-
def encode_entities(*instructions)
|
239
|
-
str = nil
|
240
|
-
if (instructions.empty?)
|
241
|
-
instructions = [:basic]
|
242
|
-
else
|
243
|
-
instructions.each do |instr|
|
244
|
-
unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
|
245
|
-
raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
|
246
|
-
end
|
247
|
-
end
|
248
|
-
instructions.sort! { |a,b|
|
249
|
-
ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
|
250
|
-
ENCODE_ENTITIES_COMMAND_ORDER[b]
|
251
|
-
}
|
252
|
-
end
|
253
|
-
instructions.each do |instruction|
|
254
|
-
case instruction
|
255
|
-
when :basic
|
256
|
-
# Handled as basic ASCII
|
257
|
-
str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
|
258
|
-
# It's safe to use the simpler [0] here because we know
|
259
|
-
# that the basic entities are ASCII.
|
260
|
-
'&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
|
261
|
-
}
|
262
|
-
when :named
|
263
|
-
# Test everything except printable ASCII
|
264
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
265
|
-
cp = $&.unpack('U')[0]
|
266
|
-
(e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
|
267
|
-
}
|
268
|
-
when :decimal
|
269
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
270
|
-
"&##{$&.unpack('U')[0]};"
|
271
|
-
}
|
272
|
-
when :hexadecimal
|
273
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
274
|
-
"&#x#{$&.unpack('U')[0].to_s(16)};"
|
275
|
-
}
|
276
|
-
end
|
277
|
-
end
|
278
|
-
return str
|
279
|
-
end
|
280
|
-
|
281
|
-
end
|