rdig 0.3.5 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -1
- data/History.txt +11 -0
- data/Manifest.txt +39 -0
- data/doc/examples/config.rb +10 -0
- data/lib/rdig.rb +12 -8
- data/lib/rdig/content_extractors.rb +7 -1
- data/lib/rdig/content_extractors/doc.rb +5 -19
- data/lib/rdig/content_extractors/hpricot.rb +13 -9
- data/lib/rdig/crawler.rb +18 -10
- data/lib/rdig/documents.rb +13 -9
- data/lib/rdig/url_filters.rb +15 -8
- data/rakefile +2 -1
- data/test/fixtures/html/frameset.html +13 -0
- data/test/fixtures/html/imagemap.html +13 -0
- data/test/unit/hpricot_content_extractor_test.rb +12 -0
- data/test/unit/searcher_test.rb +2 -2
- data/test/unit/url_filters_test.rb +1 -1
- metadata +108 -62
- data/TODO +0 -0
- data/lib/htmlentities/CHANGES +0 -21
- data/lib/htmlentities/COPYING +0 -7
- data/lib/htmlentities/README +0 -15
- data/lib/htmlentities/htmlentities.rb +0 -281
data/CHANGES
CHANGED
@@ -1,8 +1,13 @@
|
|
1
|
+
0.3.6
|
2
|
+
- remove bundled htmlentities in favor of a gem dependency
|
3
|
+
- also extract links from area and frame tags
|
4
|
+
- fix etagfilter bug
|
5
|
+
|
1
6
|
0.3.5
|
2
7
|
- Add max_depth option to crawler configuration for limiting the crawl to a
|
3
8
|
specific depth
|
4
9
|
- add support for http proxies including basic authentication
|
5
|
-
- remove
|
10
|
+
- remove rubyful_soup support
|
6
11
|
|
7
12
|
0.3.4
|
8
13
|
|
data/History.txt
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
== 0.3.8 2009-04-26
|
2
|
+
|
3
|
+
* bump up version
|
4
|
+
|
5
|
+
== 0.3.7 2009-04-26
|
6
|
+
|
7
|
+
* Gem spec for automatic gem building on github
|
8
|
+
* doc enhancements
|
9
|
+
* better uri-normalization, re-add result uri of redirection
|
10
|
+
into the queue instead of directly indexing the resulting
|
11
|
+
page
|
data/Manifest.txt
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
CHANGES
|
2
|
+
History.txt
|
3
|
+
install.rb
|
4
|
+
LICENSE
|
5
|
+
Manifest.txt
|
6
|
+
rakefile
|
7
|
+
README
|
8
|
+
bin/rdig
|
9
|
+
doc/examples/config.rb
|
10
|
+
lib/rdig/content_extractors/doc.rb
|
11
|
+
lib/rdig/content_extractors/hpricot.rb
|
12
|
+
lib/rdig/content_extractors/pdf.rb
|
13
|
+
lib/rdig/content_extractors.rb
|
14
|
+
lib/rdig/crawler.rb
|
15
|
+
lib/rdig/documents.rb
|
16
|
+
lib/rdig/file.rb
|
17
|
+
lib/rdig/highlight.rb
|
18
|
+
lib/rdig/index.rb
|
19
|
+
lib/rdig/search.rb
|
20
|
+
lib/rdig/url_filters.rb
|
21
|
+
lib/rdig.rb
|
22
|
+
test/fixtures/html/custom_tag_selectors.html
|
23
|
+
test/fixtures/html/entities.html
|
24
|
+
test/fixtures/html/frameset.html
|
25
|
+
test/fixtures/html/imagemap.html
|
26
|
+
test/fixtures/html/simple.html
|
27
|
+
test/fixtures/pdf/simple.pdf
|
28
|
+
test/fixtures/word/simple.doc
|
29
|
+
test/test_helper.rb
|
30
|
+
test/unit/crawler_fs_test.rb
|
31
|
+
test/unit/etag_filter_test.rb
|
32
|
+
test/unit/file_document_test.rb
|
33
|
+
test/unit/hpricot_content_extractor_test.rb
|
34
|
+
test/unit/http_document_test.rb
|
35
|
+
test/unit/pdf_content_extractor_test.rb
|
36
|
+
test/unit/rdig_test.rb
|
37
|
+
test/unit/searcher_test.rb
|
38
|
+
test/unit/url_filters_test.rb
|
39
|
+
test/unit/word_content_extractor_test.rb
|
data/doc/examples/config.rb
CHANGED
@@ -86,6 +86,13 @@ RDig.configuration do |cfg|
|
|
86
86
|
# Set to 0 to only index the start_urls.
|
87
87
|
# cfg.crawler.max_depth = nil
|
88
88
|
|
89
|
+
# default index document to be appended to URIs ending with a trailing '/'
|
90
|
+
# cfg.crawler.normalize_uri.index_document = nil
|
91
|
+
# strip trailing '/' from URIs to avoid double indexing of pages referred by '
|
92
|
+
# Ignored if index_document is set.
|
93
|
+
# Not necessary when the server issues proper etags since the default etag filter will kill these duplicates.
|
94
|
+
# cfg.crawler.normalize_uri.remove_trailing_slash = nil
|
95
|
+
|
89
96
|
# http proxy configuration
|
90
97
|
# proxy url
|
91
98
|
# cfg.crawler.http_proxy = nil
|
@@ -94,6 +101,9 @@ RDig.configuration do |cfg|
|
|
94
101
|
# cfg.crawler.http_proxy_user = nil
|
95
102
|
# proxy password
|
96
103
|
# cfg.crawler.http_proxy_pass = nil
|
104
|
+
#
|
105
|
+
# to use basic auth without a proxy, use this syntax:
|
106
|
+
# cfg.crawler.open_uri_http_options = { :http_basic_authentication => [user, password] }
|
97
107
|
|
98
108
|
# indexer options
|
99
109
|
|
data/lib/rdig.rb
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#++
|
25
25
|
#
|
26
26
|
|
27
|
-
RDIGVERSION = '0.3.
|
27
|
+
RDIGVERSION = '0.3.8'
|
28
28
|
|
29
29
|
|
30
30
|
require 'thread'
|
@@ -49,7 +49,8 @@ rescue LoadError
|
|
49
49
|
require 'ferret'
|
50
50
|
end
|
51
51
|
|
52
|
-
|
52
|
+
|
53
|
+
#require 'htmlentities/htmlentities'
|
53
54
|
|
54
55
|
|
55
56
|
$KCODE = 'u'
|
@@ -60,17 +61,16 @@ module RDig
|
|
60
61
|
|
61
62
|
class << self
|
62
63
|
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
66
|
-
# a document has to survive all filters in the chain to get indexed.
|
64
|
+
# Filter chains are used by the crawler to limit the set of documents being indexed.
|
65
|
+
# There are two chains - one for http, and one for file system crawling.
|
66
|
+
# Each document has to survive all filters in the relevant chain to get indexed.
|
67
67
|
def filter_chain
|
68
68
|
@filter_chain ||= {
|
69
69
|
# filter chain for http crawling
|
70
70
|
:http => [
|
71
71
|
:scheme_filter_http,
|
72
72
|
:fix_relative_uri,
|
73
|
-
:normalize_uri,
|
73
|
+
{ :normalize_uri => :normalize_uri },
|
74
74
|
{ RDig::UrlFilters::DepthFilter => :max_depth },
|
75
75
|
{ :hostname_filter => :include_hosts },
|
76
76
|
{ RDig::UrlFilters::UrlInclusionFilter => :include_documents },
|
@@ -120,7 +120,11 @@ module RDig
|
|
120
120
|
:wait_before_leave => 10,
|
121
121
|
:http_proxy => nil,
|
122
122
|
:http_proxy_user => nil,
|
123
|
-
:http_proxy_pass => nil
|
123
|
+
:http_proxy_pass => nil,
|
124
|
+
:normalize_uri => OpenStruct.new(
|
125
|
+
:index_document => nil,
|
126
|
+
:remove_trailing_slash => nil
|
127
|
+
)
|
124
128
|
),
|
125
129
|
:content_extraction => OpenStruct.new(
|
126
130
|
# settings for html content extraction (hpricot)
|
@@ -23,7 +23,13 @@ module RDig
|
|
23
23
|
def self.extractor_instances
|
24
24
|
@@extractor_instances ||= extractors.map { |ex_class|
|
25
25
|
RDig.logger.info "initializing content extractor: #{ex_class}"
|
26
|
-
|
26
|
+
ex = nil
|
27
|
+
begin
|
28
|
+
ex = ex_class.new(RDig.configuration.content_extraction)
|
29
|
+
rescue Exception
|
30
|
+
RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
|
31
|
+
end
|
32
|
+
ex
|
27
33
|
}.compact
|
28
34
|
end
|
29
35
|
|
@@ -13,25 +13,11 @@ module RDig
|
|
13
13
|
@wvhtml = 'wvHtml'
|
14
14
|
@pattern = /^application\/msword/
|
15
15
|
# html extractor for parsing wvHtml output
|
16
|
-
|
17
|
-
|
18
|
-
:
|
19
|
-
|
20
|
-
|
21
|
-
)))
|
22
|
-
elsif defined?(RubyfulSoupContentExtractor)
|
23
|
-
@html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
|
24
|
-
:rubyful_soup => OpenStruct.new(
|
25
|
-
:content_tag_selector => lambda { |tagsoup|
|
26
|
-
tagsoup.html.body
|
27
|
-
},
|
28
|
-
:title_tag_selector => lambda { |tagsoup|
|
29
|
-
tagsoup.html.head.title
|
30
|
-
}
|
31
|
-
)))
|
32
|
-
else
|
33
|
-
raise "need at least one html content extractor - please install hpricot or rubyful_soup"
|
34
|
-
end
|
16
|
+
@html_extractor = HpricotContentExtractor.new(OpenStruct.new(
|
17
|
+
:hpricot => OpenStruct.new(
|
18
|
+
:content_tag_selector => 'body',
|
19
|
+
:title_tag_selector => 'title'
|
20
|
+
)))
|
35
21
|
# TODO: better: if $?.exitstatus == 127 (not found)
|
36
22
|
@available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
|
37
23
|
end
|
@@ -1,11 +1,12 @@
|
|
1
1
|
begin
|
2
2
|
require 'hpricot'
|
3
|
+
require 'htmlentities'
|
3
4
|
rescue LoadError
|
4
5
|
require 'rubygems'
|
5
6
|
require 'hpricot'
|
7
|
+
require 'htmlentities'
|
6
8
|
end
|
7
9
|
|
8
|
-
if defined?(Hpricot)
|
9
10
|
module RDig
|
10
11
|
module ContentExtractors
|
11
12
|
|
@@ -23,11 +24,12 @@ module RDig
|
|
23
24
|
# :title => 'Title',
|
24
25
|
# :links => [array of urls] }
|
25
26
|
def process(content)
|
27
|
+
entities = HTMLEntities.new
|
26
28
|
doc = Hpricot(content)
|
27
29
|
{
|
28
|
-
:title => extract_title(doc).
|
30
|
+
:title => entities.decode(extract_title(doc)).strip,
|
29
31
|
:links => extract_links(doc),
|
30
|
-
:content => extract_content(doc)
|
32
|
+
:content => entities.decode(extract_content(doc))
|
31
33
|
}
|
32
34
|
end
|
33
35
|
|
@@ -50,12 +52,14 @@ module RDig
|
|
50
52
|
# extracts the href attributes of all a tags, except
|
51
53
|
# internal links like <a href="#top">
|
52
54
|
def extract_links(doc)
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
55
|
+
{'a' => 'href', 'area' => 'href', 'frame' => 'src'}.map do |tag, attr|
|
56
|
+
(doc/tag).map do |tag|
|
57
|
+
value = tag[attr]
|
58
|
+
CGI.unescapeHTML(value) if value && value !~ /^#/
|
59
|
+
end
|
60
|
+
end.flatten.compact
|
57
61
|
end
|
58
|
-
|
62
|
+
|
59
63
|
# Extracts the title from the given html tree
|
60
64
|
def extract_title(doc)
|
61
65
|
the_title_tag = title_tag(doc)
|
@@ -85,6 +89,7 @@ module RDig
|
|
85
89
|
def strip_comments(string)
|
86
90
|
string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
|
87
91
|
end
|
92
|
+
|
88
93
|
def strip_tags(string)
|
89
94
|
string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
|
90
95
|
Regexp::MULTILINE, 'u'), ''
|
@@ -98,4 +103,3 @@ module RDig
|
|
98
103
|
|
99
104
|
end
|
100
105
|
end
|
101
|
-
end
|
data/lib/rdig/crawler.rb
CHANGED
@@ -5,7 +5,6 @@ module RDig
|
|
5
5
|
|
6
6
|
def initialize(config = RDig.config, logger = RDig.logger)
|
7
7
|
@documents = Queue.new
|
8
|
-
@etag_filter = ETagFilter.new
|
9
8
|
@logger = logger
|
10
9
|
@config = config
|
11
10
|
end
|
@@ -22,7 +21,8 @@ module RDig
|
|
22
21
|
# check whether we are indexing on-disk or via http
|
23
22
|
url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
|
24
23
|
chain_config = RDig.filter_chain[url_type]
|
25
|
-
|
24
|
+
|
25
|
+
@etag_filter = ETagFilter.new
|
26
26
|
filterchain = UrlFilters::FilterChain.new(chain_config)
|
27
27
|
@config.crawler.start_urls.each { |url| add_url(url, filterchain) }
|
28
28
|
|
@@ -52,13 +52,21 @@ module RDig
|
|
52
52
|
def process_document(doc, filterchain)
|
53
53
|
@logger.debug "processing document #{doc}"
|
54
54
|
doc.fetch
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
55
|
+
case doc.status
|
56
|
+
when :success
|
57
|
+
if @etag_filter.apply(doc)
|
58
|
+
# add links from this document to the queue
|
59
|
+
doc.content[:links].each { |url|
|
60
|
+
add_url(url, filterchain, doc)
|
61
|
+
} unless doc.content[:links].nil?
|
62
|
+
add_to_index doc
|
63
|
+
end
|
64
|
+
when :redirect
|
65
|
+
@logger.debug "redirect to #{doc.content}"
|
66
|
+
add_url(doc.content, filterchain, doc)
|
67
|
+
else
|
68
|
+
@logger.error "unknown doc status #{doc.status}: #{doc}"
|
69
|
+
end
|
62
70
|
rescue
|
63
71
|
@logger.error "error processing document #{doc.uri.to_s}: #{$!}"
|
64
72
|
@logger.debug "Trace: #{$!.backtrace.join("\n")}"
|
@@ -110,7 +118,7 @@ module RDig
|
|
110
118
|
end
|
111
119
|
|
112
120
|
def apply(document)
|
113
|
-
return document unless (document.respond_to?(:etag) && document.etag)
|
121
|
+
return document unless (document.respond_to?(:etag) && document.etag && !document.etag.empty?)
|
114
122
|
synchronize do
|
115
123
|
@etags.add?(document.etag) ? document : nil
|
116
124
|
end
|
data/lib/rdig/documents.rb
CHANGED
@@ -118,16 +118,20 @@ module RDig
|
|
118
118
|
def fetch
|
119
119
|
RDig.logger.debug "fetching #{@uri.to_s}"
|
120
120
|
open(@uri.to_s, RDig::open_uri_http_options) do |doc|
|
121
|
-
|
122
|
-
|
123
|
-
@
|
124
|
-
# puts "etag: #{@etag}"
|
125
|
-
@content = ContentExtractors.process(doc.read, doc.content_type)
|
126
|
-
@status = :success
|
127
|
-
when 404
|
128
|
-
RDig.logger.info "got 404 for #{@uri}"
|
121
|
+
if @uri.to_s != doc.base_uri.to_s
|
122
|
+
@status = :redirect
|
123
|
+
@content = doc.base_uri
|
129
124
|
else
|
130
|
-
|
125
|
+
case doc.status.first.to_i
|
126
|
+
when 200
|
127
|
+
@etag = doc.meta['etag']
|
128
|
+
@content = ContentExtractors.process(doc.read, doc.content_type)
|
129
|
+
@status = :success
|
130
|
+
when 404
|
131
|
+
RDig.logger.info "got 404 for #{@uri}"
|
132
|
+
else
|
133
|
+
RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
|
134
|
+
end
|
131
135
|
end
|
132
136
|
end
|
133
137
|
rescue
|
data/lib/rdig/url_filters.rb
CHANGED
@@ -22,7 +22,7 @@ module RDig
|
|
22
22
|
end
|
23
23
|
|
24
24
|
# add a filter and its args to the chain
|
25
|
-
#
|
25
|
+
# if args is a symbol, it is treated as a configuration key
|
26
26
|
def add(filter, args=nil)
|
27
27
|
args = RDig.config.crawler.send(args) if args.is_a? Symbol
|
28
28
|
case filter
|
@@ -163,7 +163,7 @@ module RDig
|
|
163
163
|
return document
|
164
164
|
end
|
165
165
|
|
166
|
-
# expands
|
166
|
+
# expands href="/path/xyz.html", href="affe.html" and href="../lala.html"
|
167
167
|
# to full urls
|
168
168
|
def UrlFilters.fix_relative_uri(document)
|
169
169
|
#return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
|
@@ -175,11 +175,13 @@ module RDig
|
|
175
175
|
uri.port = ref.port unless uri.port || ref.port==ref.default_port
|
176
176
|
uri.path = ref.path unless uri.path
|
177
177
|
|
178
|
-
|
178
|
+
old_uri_path = uri.path
|
179
|
+
if uri.path !~ /^\// || uri.path =~ /^\.\./
|
179
180
|
ref_path = ref.path || '/'
|
180
181
|
ref_path << '/' if ref_path.empty?
|
181
182
|
uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
|
182
|
-
end
|
183
|
+
end
|
184
|
+
uri.path = uri.path.sub( /\/[^\/]*\/\.\./, "" ) if old_uri_path =~ /^\.\./
|
183
185
|
return document
|
184
186
|
rescue
|
185
187
|
p document
|
@@ -193,12 +195,17 @@ module RDig
|
|
193
195
|
return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
|
194
196
|
end
|
195
197
|
|
196
|
-
def UrlFilters.normalize_uri(document)
|
198
|
+
def UrlFilters.normalize_uri(document, cfg)
|
197
199
|
document.uri.fragment = nil
|
198
200
|
# document.uri.query = nil
|
199
|
-
#
|
200
|
-
if
|
201
|
-
document
|
201
|
+
# trailing slash handling
|
202
|
+
if document.uri.path =~ /\/$/
|
203
|
+
# append index document if configured
|
204
|
+
if cfg.index_document
|
205
|
+
document.uri.path << RDig.config.index_document
|
206
|
+
elsif cfg.remove_trailing_slash
|
207
|
+
document.uri.path.gsub! /\/$/, ''
|
208
|
+
end
|
202
209
|
end
|
203
210
|
return document
|
204
211
|
end
|
data/rakefile
CHANGED
@@ -132,6 +132,7 @@ else
|
|
132
132
|
|
133
133
|
s.add_dependency('ferret', '>= 0.10.0')
|
134
134
|
s.add_dependency('hpricot', '>= 0.6')
|
135
|
+
s.add_dependency('htmlentities', '>= 4.0.0')
|
135
136
|
#s.requirements << ""
|
136
137
|
|
137
138
|
#### Which files are to be included in this gem? Everything! (Except CVS directories.)
|
@@ -321,7 +322,7 @@ task :tag => [:prerelease] do
|
|
321
322
|
if ENV['RELTEST']
|
322
323
|
announce "Release Task Testing, skipping tagging"
|
323
324
|
else
|
324
|
-
sh %{
|
325
|
+
sh %{svn copy svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/trunk svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/tags/#{reltag}}
|
325
326
|
end
|
326
327
|
end
|
327
328
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<frameset>
|
9
|
+
<frame src="http://test.host/first.html" />
|
10
|
+
<frame src="/second.html" />
|
11
|
+
</frameset>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<map>
|
9
|
+
<area href="http://test.host/first.html" />
|
10
|
+
<area href="/second.html" />
|
11
|
+
</map>
|
12
|
+
</body>
|
13
|
+
</html>
|
@@ -52,6 +52,18 @@ class HpricotContentExtractorTest < Test::Unit::TestCase
|
|
52
52
|
assert_equal '/inside.html', result[:links][1]
|
53
53
|
assert_equal '/footer.html', result[:links][2]
|
54
54
|
end
|
55
|
+
|
56
|
+
def test_extracts_links_from_frameset
|
57
|
+
result = @extractor.process(html_doc('frameset'))
|
58
|
+
assert_equal 'http://test.host/first.html', result[:links].first
|
59
|
+
assert_equal '/second.html', result[:links].last
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_extracts_links_from_imagemap
|
63
|
+
result = @extractor.process(html_doc('imagemap'))
|
64
|
+
assert_equal 'http://test.host/first.html', result[:links].first
|
65
|
+
assert_equal '/second.html', result[:links].last
|
66
|
+
end
|
55
67
|
|
56
68
|
|
57
69
|
def test_title_from_dcmeta
|
data/test/unit/searcher_test.rb
CHANGED
@@ -28,8 +28,8 @@ class SearcherTest < Test::Unit::TestCase
|
|
28
28
|
|
29
29
|
def test_search
|
30
30
|
result = RDig.searcher.search 'some sample text'
|
31
|
-
assert_equal
|
32
|
-
assert_equal
|
31
|
+
assert_equal 5, result[:hitcount]
|
32
|
+
assert_equal 5, result[:list].size
|
33
33
|
end
|
34
34
|
|
35
35
|
end
|
@@ -74,7 +74,7 @@ class UrlFiltersTest < Test::Unit::TestCase
|
|
74
74
|
doc = Document.create('http://test.host/dir/file.html')
|
75
75
|
assert_equal('http://test.host/dir/another.html',
|
76
76
|
UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
|
77
|
-
assert_equal('http://test.host/
|
77
|
+
assert_equal('http://test.host/another.html',
|
78
78
|
UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
|
79
79
|
assert_equal('http://test.host/dir/another.html',
|
80
80
|
UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rdig
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 3
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 3
|
9
|
+
- 8
|
10
|
+
version: 0.3.8
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Jens Kraemer
|
@@ -9,117 +15,157 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2009-04-26 00:00:00 +02:00
|
19
|
+
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: ferret
|
17
|
-
|
18
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
19
26
|
requirements:
|
20
27
|
- - ">="
|
21
28
|
- !ruby/object:Gem::Version
|
22
|
-
|
23
|
-
|
29
|
+
hash: 63
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
- 11
|
33
|
+
- 6
|
34
|
+
version: 0.11.6
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
24
37
|
- !ruby/object:Gem::Dependency
|
25
38
|
name: hpricot
|
26
|
-
|
27
|
-
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
28
42
|
requirements:
|
29
43
|
- - ">="
|
30
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
- 6
|
31
49
|
version: "0.6"
|
32
|
-
|
33
|
-
|
50
|
+
type: :runtime
|
51
|
+
version_requirements: *id002
|
52
|
+
- !ruby/object:Gem::Dependency
|
53
|
+
name: htmlentities
|
54
|
+
prerelease: false
|
55
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
56
|
+
none: false
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
hash: 63
|
61
|
+
segments:
|
62
|
+
- 4
|
63
|
+
- 0
|
64
|
+
- 0
|
65
|
+
version: 4.0.0
|
66
|
+
type: :runtime
|
67
|
+
version_requirements: *id003
|
68
|
+
description: Website crawler and fulltext indexer.
|
34
69
|
email: jk@jkraemer.net
|
35
|
-
executables:
|
36
|
-
|
70
|
+
executables: []
|
71
|
+
|
37
72
|
extensions: []
|
38
73
|
|
39
74
|
extra_rdoc_files:
|
75
|
+
- History.txt
|
76
|
+
- Manifest.txt
|
40
77
|
- README
|
78
|
+
files:
|
41
79
|
- CHANGES
|
80
|
+
- History.txt
|
81
|
+
- install.rb
|
42
82
|
- LICENSE
|
43
|
-
-
|
44
|
-
|
83
|
+
- Manifest.txt
|
84
|
+
- rakefile
|
85
|
+
- README
|
45
86
|
- bin/rdig
|
46
|
-
-
|
47
|
-
- lib/rdig
|
48
|
-
- lib/rdig/
|
49
|
-
- lib/rdig/
|
50
|
-
- lib/rdig/crawler.rb
|
87
|
+
- doc/examples/config.rb
|
88
|
+
- lib/rdig/content_extractors/doc.rb
|
89
|
+
- lib/rdig/content_extractors/hpricot.rb
|
90
|
+
- lib/rdig/content_extractors/pdf.rb
|
51
91
|
- lib/rdig/content_extractors.rb
|
92
|
+
- lib/rdig/crawler.rb
|
93
|
+
- lib/rdig/documents.rb
|
52
94
|
- lib/rdig/file.rb
|
53
95
|
- lib/rdig/highlight.rb
|
54
|
-
- lib/rdig/
|
96
|
+
- lib/rdig/index.rb
|
55
97
|
- lib/rdig/search.rb
|
56
|
-
- lib/rdig/
|
57
|
-
- lib/rdig
|
58
|
-
- lib/rdig/content_extractors/hpricot.rb
|
59
|
-
- lib/rdig/content_extractors/pdf.rb
|
60
|
-
- lib/htmlentities
|
61
|
-
- lib/htmlentities/htmlentities.rb
|
62
|
-
- lib/htmlentities/COPYING
|
63
|
-
- lib/htmlentities/CHANGES
|
64
|
-
- lib/htmlentities/README
|
65
|
-
- test/fixtures
|
66
|
-
- test/fixtures/word
|
67
|
-
- test/fixtures/word/simple.doc
|
68
|
-
- test/fixtures/html
|
98
|
+
- lib/rdig/url_filters.rb
|
99
|
+
- lib/rdig.rb
|
69
100
|
- test/fixtures/html/custom_tag_selectors.html
|
70
|
-
- test/fixtures/html/simple.html
|
71
101
|
- test/fixtures/html/entities.html
|
72
|
-
- test/fixtures/
|
102
|
+
- test/fixtures/html/frameset.html
|
103
|
+
- test/fixtures/html/imagemap.html
|
104
|
+
- test/fixtures/html/simple.html
|
73
105
|
- test/fixtures/pdf/simple.pdf
|
74
|
-
- test/
|
106
|
+
- test/fixtures/word/simple.doc
|
107
|
+
- test/test_helper.rb
|
75
108
|
- test/unit/crawler_fs_test.rb
|
109
|
+
- test/unit/etag_filter_test.rb
|
110
|
+
- test/unit/file_document_test.rb
|
111
|
+
- test/unit/hpricot_content_extractor_test.rb
|
112
|
+
- test/unit/http_document_test.rb
|
76
113
|
- test/unit/pdf_content_extractor_test.rb
|
77
|
-
- test/unit/word_content_extractor_test.rb
|
78
114
|
- test/unit/rdig_test.rb
|
79
|
-
- test/unit/http_document_test.rb
|
80
115
|
- test/unit/searcher_test.rb
|
81
|
-
- test/unit/file_document_test.rb
|
82
116
|
- test/unit/url_filters_test.rb
|
83
|
-
- test/unit/
|
84
|
-
- test/unit/etag_filter_test.rb
|
85
|
-
- test/test_helper.rb
|
86
|
-
- doc/examples
|
87
|
-
- doc/examples/config.rb
|
88
|
-
- LICENSE
|
89
|
-
- TODO
|
90
|
-
- CHANGES
|
91
|
-
- README
|
92
|
-
- install.rb
|
93
|
-
- rakefile
|
117
|
+
- test/unit/word_content_extractor_test.rb
|
94
118
|
has_rdoc: true
|
95
|
-
homepage: http://
|
119
|
+
homepage: http://github.com/jkraemer/rdig/
|
120
|
+
licenses: []
|
121
|
+
|
96
122
|
post_install_message:
|
97
123
|
rdoc_options:
|
98
|
-
- --title
|
99
|
-
- Rake -- Ruby Make
|
100
124
|
- --main
|
101
125
|
- README
|
102
|
-
- --line-numbers
|
103
126
|
require_paths:
|
104
127
|
- lib
|
105
128
|
required_ruby_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
106
130
|
requirements:
|
107
131
|
- - ">="
|
108
132
|
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
134
|
+
segments:
|
135
|
+
- 0
|
109
136
|
version: "0"
|
110
|
-
version:
|
111
137
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
112
139
|
requirements:
|
113
140
|
- - ">="
|
114
141
|
- !ruby/object:Gem::Version
|
142
|
+
hash: 3
|
143
|
+
segments:
|
144
|
+
- 0
|
115
145
|
version: "0"
|
116
|
-
version:
|
117
146
|
requirements: []
|
118
147
|
|
119
148
|
rubyforge_project: rdig
|
120
|
-
rubygems_version: 1.
|
149
|
+
rubygems_version: 1.5.3
|
121
150
|
signing_key:
|
122
|
-
specification_version:
|
123
|
-
summary:
|
124
|
-
test_files:
|
125
|
-
|
151
|
+
specification_version: 3
|
152
|
+
summary: Crawler and content extractor for building a full text index of a website's contents. Uses Ferret for indexing.
|
153
|
+
test_files:
|
154
|
+
- test/fixtures/html/custom_tag_selectors.html
|
155
|
+
- test/fixtures/html/entities.html
|
156
|
+
- test/fixtures/html/frameset.html
|
157
|
+
- test/fixtures/html/imagemap.html
|
158
|
+
- test/fixtures/html/simple.html
|
159
|
+
- test/fixtures/pdf/simple.pdf
|
160
|
+
- test/fixtures/word/simple.doc
|
161
|
+
- test/test_helper.rb
|
162
|
+
- test/unit/crawler_fs_test.rb
|
163
|
+
- test/unit/etag_filter_test.rb
|
164
|
+
- test/unit/file_document_test.rb
|
165
|
+
- test/unit/hpricot_content_extractor_test.rb
|
166
|
+
- test/unit/http_document_test.rb
|
167
|
+
- test/unit/pdf_content_extractor_test.rb
|
168
|
+
- test/unit/rdig_test.rb
|
169
|
+
- test/unit/searcher_test.rb
|
170
|
+
- test/unit/url_filters_test.rb
|
171
|
+
- test/unit/word_content_extractor_test.rb
|
data/TODO
DELETED
File without changes
|
data/lib/htmlentities/CHANGES
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
== 2.2 (2005-11-07)
|
2
|
-
* Important bug fixes -- thanks to Moonwolf
|
3
|
-
* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
|
4
|
-
* Decimal decoding edge cases addressed.
|
5
|
-
* Test cases added.
|
6
|
-
|
7
|
-
== 2.1 (2005-10-31)
|
8
|
-
* Removed some unnecessary code in basic entity encoding.
|
9
|
-
* Improved handling of encoding: commands are now automatically sorted, so the
|
10
|
-
user doesn't have to worry about their order.
|
11
|
-
* Now using setup.rb.
|
12
|
-
* Tests moved to separate file.
|
13
|
-
|
14
|
-
== 2.0 (2005-08-23)
|
15
|
-
* Added encoding to entities.
|
16
|
-
* Decoding interface unchanged.
|
17
|
-
* Fixed a bug with handling high codepoints.
|
18
|
-
|
19
|
-
== 1.0 (2005-08-03)
|
20
|
-
* Initial release.
|
21
|
-
* Decoding only.
|
data/lib/htmlentities/COPYING
DELETED
data/lib/htmlentities/README
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
HTML entity encoding and decoding for Ruby
|
2
|
-
|
3
|
-
This library extends the String class to allow encoding and decoding of
|
4
|
-
HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
5
|
-
|
6
|
-
To install (requires root/admin privileges):
|
7
|
-
|
8
|
-
# ruby setup.rb
|
9
|
-
|
10
|
-
To test:
|
11
|
-
|
12
|
-
$ ruby setup.rb test
|
13
|
-
|
14
|
-
Comments are welcome. Send an email to pbattley @ gmail.com.
|
15
|
-
|
@@ -1,281 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# HTML entity encoding and decoding for Ruby
|
3
|
-
#
|
4
|
-
# Author:: Paul BATTLEY (pbattley @ gmail.com)
|
5
|
-
# Version:: 2.2
|
6
|
-
# Date:: 2005-11-07
|
7
|
-
#
|
8
|
-
# == About
|
9
|
-
#
|
10
|
-
# This library extends the String class to allow encoding and decoding of
|
11
|
-
# HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
12
|
-
#
|
13
|
-
# == Licence
|
14
|
-
#
|
15
|
-
# Copyright (c) 2005 Paul Battley
|
16
|
-
#
|
17
|
-
# Usage of the works is permitted provided that this instrument is retained
|
18
|
-
# with the works, so that any entity that uses the works is notified of this
|
19
|
-
# instrument.
|
20
|
-
#
|
21
|
-
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
|
22
|
-
#
|
23
|
-
|
24
|
-
module HTMLEntities
|
25
|
-
|
26
|
-
VERSION = '2.2'
|
27
|
-
|
28
|
-
#
|
29
|
-
# MAP is a hash of all the HTML entities I could discover, as taken
|
30
|
-
# from the w3schools page on the subject:
|
31
|
-
# http://www.w3schools.com/html/html_entitiesref.asp
|
32
|
-
# The format is 'entity name' => codepoint where entity name is given
|
33
|
-
# without the surrounding ampersand and semicolon.
|
34
|
-
#
|
35
|
-
MAP = {
|
36
|
-
'quot' => 34,
|
37
|
-
'apos' => 39,
|
38
|
-
'amp' => 38,
|
39
|
-
'lt' => 60,
|
40
|
-
'gt' => 62,
|
41
|
-
'nbsp' => 160,
|
42
|
-
'iexcl' => 161,
|
43
|
-
'curren' => 164,
|
44
|
-
'cent' => 162,
|
45
|
-
'pound' => 163,
|
46
|
-
'yen' => 165,
|
47
|
-
'brvbar' => 166,
|
48
|
-
'sect' => 167,
|
49
|
-
'uml' => 168,
|
50
|
-
'copy' => 169,
|
51
|
-
'ordf' => 170,
|
52
|
-
'laquo' => 171,
|
53
|
-
'not' => 172,
|
54
|
-
'shy' => 173,
|
55
|
-
'reg' => 174,
|
56
|
-
'trade' => 8482,
|
57
|
-
'macr' => 175,
|
58
|
-
'deg' => 176,
|
59
|
-
'plusmn' => 177,
|
60
|
-
'sup2' => 178,
|
61
|
-
'sup3' => 179,
|
62
|
-
'acute' => 180,
|
63
|
-
'micro' => 181,
|
64
|
-
'para' => 182,
|
65
|
-
'middot' => 183,
|
66
|
-
'cedil' => 184,
|
67
|
-
'sup1' => 185,
|
68
|
-
'ordm' => 186,
|
69
|
-
'raquo' => 187,
|
70
|
-
'frac14' => 188,
|
71
|
-
'frac12' => 189,
|
72
|
-
'frac34' => 190,
|
73
|
-
'iquest' => 191,
|
74
|
-
'times' => 215,
|
75
|
-
'divide' => 247,
|
76
|
-
'Agrave' => 192,
|
77
|
-
'Aacute' => 193,
|
78
|
-
'Acirc' => 194,
|
79
|
-
'Atilde' => 195,
|
80
|
-
'Auml' => 196,
|
81
|
-
'Aring' => 197,
|
82
|
-
'AElig' => 198,
|
83
|
-
'Ccedil' => 199,
|
84
|
-
'Egrave' => 200,
|
85
|
-
'Eacute' => 201,
|
86
|
-
'Ecirc' => 202,
|
87
|
-
'Euml' => 203,
|
88
|
-
'Igrave' => 204,
|
89
|
-
'Iacute' => 205,
|
90
|
-
'Icirc' => 206,
|
91
|
-
'Iuml' => 207,
|
92
|
-
'ETH' => 208,
|
93
|
-
'Ntilde' => 209,
|
94
|
-
'Ograve' => 210,
|
95
|
-
'Oacute' => 211,
|
96
|
-
'Ocirc' => 212,
|
97
|
-
'Otilde' => 213,
|
98
|
-
'Ouml' => 214,
|
99
|
-
'Oslash' => 216,
|
100
|
-
'Ugrave' => 217,
|
101
|
-
'Uacute' => 218,
|
102
|
-
'Ucirc' => 219,
|
103
|
-
'Uuml' => 220,
|
104
|
-
'Yacute' => 221,
|
105
|
-
'THORN' => 222,
|
106
|
-
'szlig' => 223,
|
107
|
-
'agrave' => 224,
|
108
|
-
'aacute' => 225,
|
109
|
-
'acirc' => 226,
|
110
|
-
'atilde' => 227,
|
111
|
-
'auml' => 228,
|
112
|
-
'aring' => 229,
|
113
|
-
'aelig' => 230,
|
114
|
-
'ccedil' => 231,
|
115
|
-
'egrave' => 232,
|
116
|
-
'eacute' => 233,
|
117
|
-
'ecirc' => 234,
|
118
|
-
'euml' => 235,
|
119
|
-
'igrave' => 236,
|
120
|
-
'iacute' => 237,
|
121
|
-
'icirc' => 238,
|
122
|
-
'iuml' => 239,
|
123
|
-
'eth' => 240,
|
124
|
-
'ntilde' => 241,
|
125
|
-
'ograve' => 242,
|
126
|
-
'oacute' => 243,
|
127
|
-
'ocirc' => 244,
|
128
|
-
'otilde' => 245,
|
129
|
-
'ouml' => 246,
|
130
|
-
'oslash' => 248,
|
131
|
-
'ugrave' => 249,
|
132
|
-
'uacute' => 250,
|
133
|
-
'ucirc' => 251,
|
134
|
-
'uuml' => 252,
|
135
|
-
'yacute' => 253,
|
136
|
-
'thorn' => 254,
|
137
|
-
'yuml' => 255,
|
138
|
-
'OElig' => 338,
|
139
|
-
'oelig' => 339,
|
140
|
-
'Scaron' => 352,
|
141
|
-
'scaron' => 353,
|
142
|
-
'Yuml' => 376,
|
143
|
-
'circ' => 710,
|
144
|
-
'tilde' => 732,
|
145
|
-
'ensp' => 8194,
|
146
|
-
'emsp' => 8195,
|
147
|
-
'thinsp' => 8201,
|
148
|
-
'zwnj' => 8204,
|
149
|
-
'zwj' => 8205,
|
150
|
-
'lrm' => 8206,
|
151
|
-
'rlm' => 8207,
|
152
|
-
'ndash' => 8211,
|
153
|
-
'mdash' => 8212,
|
154
|
-
'lsquo' => 8216,
|
155
|
-
'rsquo' => 8217,
|
156
|
-
'sbquo' => 8218,
|
157
|
-
'ldquo' => 8220,
|
158
|
-
'rdquo' => 8221,
|
159
|
-
'bdquo' => 8222,
|
160
|
-
'dagger' => 8224,
|
161
|
-
'Dagger' => 8225,
|
162
|
-
'hellip' => 8230,
|
163
|
-
'permil' => 8240,
|
164
|
-
'lsaquo' => 8249,
|
165
|
-
'rsaquo' => 8250,
|
166
|
-
'euro' => 8364
|
167
|
-
}
|
168
|
-
|
169
|
-
MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
|
170
|
-
MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
|
171
|
-
|
172
|
-
# Precompile the regexp
|
173
|
-
NAMED_ENTITY_REGEXP =
|
174
|
-
/&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
|
175
|
-
|
176
|
-
# Reverse map for converting characters to named entities
|
177
|
-
REVERSE_MAP = MAP.invert
|
178
|
-
|
179
|
-
BASIC_ENTITY_REGEXP = /[<>'"&]/
|
180
|
-
|
181
|
-
UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
|
182
|
-
|
183
|
-
end
|
184
|
-
|
185
|
-
class String
|
186
|
-
|
187
|
-
# Because there's no need to make the user worry about the order here,
|
188
|
-
# let's handle it.
|
189
|
-
ENCODE_ENTITIES_COMMAND_ORDER = {
|
190
|
-
:basic => 0,
|
191
|
-
:named => 1,
|
192
|
-
:decimal => 2,
|
193
|
-
:hexadecimal => 3
|
194
|
-
}
|
195
|
-
|
196
|
-
#
|
197
|
-
# Decode XML and HTML 4.01 entities in a string into their UTF-8
|
198
|
-
# equivalents. Obviously, if your string is not already in UTF-8, you'd
|
199
|
-
# better convert it before using this method, or the output will be mixed
|
200
|
-
# up.
|
201
|
-
# Unknown named entities are not converted
|
202
|
-
#
|
203
|
-
def decode_entities
|
204
|
-
return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
|
205
|
-
HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
|
206
|
-
}.gsub(/&#([0-9]{1,7});/) {
|
207
|
-
[$1.to_i].pack('U')
|
208
|
-
}.gsub(/&#x([0-9a-f]{1,6});/i) {
|
209
|
-
[$1.to_i(16)].pack('U')
|
210
|
-
}
|
211
|
-
end
|
212
|
-
|
213
|
-
#
|
214
|
-
# Encode codepoints into their corresponding entities. Various operations
|
215
|
-
# are possible, and may be specified in order:
|
216
|
-
#
|
217
|
-
# :basic :: Convert the five XML entities ('"<>&)
|
218
|
-
# :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
|
219
|
-
# :decimal :: Convert non-ASCII characters to decimal entities (e.g. Ӓ)
|
220
|
-
# :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # ካ)
|
221
|
-
#
|
222
|
-
# You can specify the commands in any order, but they will be executed in
|
223
|
-
# the order listed above to ensure that entity ampersands are not
|
224
|
-
# clobbered and that named entities are replaced before numeric ones.
|
225
|
-
#
|
226
|
-
# If no instructions are specified, :basic will be used.
|
227
|
-
#
|
228
|
-
# Examples:
|
229
|
-
# str.encode_entities - XML-safe
|
230
|
-
# str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
|
231
|
-
# str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
|
232
|
-
# non-ASCII characters replaced with their named entity where possible, and
|
233
|
-
# decimal equivalents otherwise.
|
234
|
-
#
|
235
|
-
# Note: It is the program's responsibility to ensure that the string
|
236
|
-
# contains valid UTF-8 before calling this method.
|
237
|
-
#
|
238
|
-
def encode_entities(*instructions)
|
239
|
-
str = nil
|
240
|
-
if (instructions.empty?)
|
241
|
-
instructions = [:basic]
|
242
|
-
else
|
243
|
-
instructions.each do |instr|
|
244
|
-
unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
|
245
|
-
raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
|
246
|
-
end
|
247
|
-
end
|
248
|
-
instructions.sort! { |a,b|
|
249
|
-
ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
|
250
|
-
ENCODE_ENTITIES_COMMAND_ORDER[b]
|
251
|
-
}
|
252
|
-
end
|
253
|
-
instructions.each do |instruction|
|
254
|
-
case instruction
|
255
|
-
when :basic
|
256
|
-
# Handled as basic ASCII
|
257
|
-
str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
|
258
|
-
# It's safe to use the simpler [0] here because we know
|
259
|
-
# that the basic entities are ASCII.
|
260
|
-
'&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
|
261
|
-
}
|
262
|
-
when :named
|
263
|
-
# Test everything except printable ASCII
|
264
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
265
|
-
cp = $&.unpack('U')[0]
|
266
|
-
(e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
|
267
|
-
}
|
268
|
-
when :decimal
|
269
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
270
|
-
"&##{$&.unpack('U')[0]};"
|
271
|
-
}
|
272
|
-
when :hexadecimal
|
273
|
-
str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
|
274
|
-
"&#x#{$&.unpack('U')[0].to_s(16)};"
|
275
|
-
}
|
276
|
-
end
|
277
|
-
end
|
278
|
-
return str
|
279
|
-
end
|
280
|
-
|
281
|
-
end
|