rdig 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +2 -0
- data/LICENSE +20 -0
- data/README +61 -0
- data/TODO +0 -0
- data/bin/rdig +32 -0
- data/doc/examples/config.rb +53 -0
- data/install.rb +89 -0
- data/lib/htmlentities/CHANGES +21 -0
- data/lib/htmlentities/COPYING +7 -0
- data/lib/htmlentities/README +15 -0
- data/lib/htmlentities/htmlentities.rb +281 -0
- data/lib/rdig.rb +243 -0
- data/lib/rdig/content_extractors.rb +145 -0
- data/lib/rdig/crawler.rb +176 -0
- data/lib/rdig/highlight.rb +24 -0
- data/lib/rdig/http_client.rb +22 -0
- data/lib/rdig/index.rb +39 -0
- data/lib/rdig/search.rb +77 -0
- data/lib/rdig/url_filters.rb +171 -0
- data/rakefile +325 -0
- data/test/fixtures/html/custom_tag_selectors.html +25 -0
- data/test/fixtures/html/entities.html +15 -0
- data/test/fixtures/html/simple.html +17 -0
- data/test/test_helper.rb +18 -0
- data/test/unit/etag_filter_test.rb +23 -0
- data/test/unit/html_content_extractor_test.rb +64 -0
- data/test/unit/url_filters_test.rb +96 -0
- metadata +102 -0
data/CHANGES
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2006 Jens Kraemer
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
= RDig
|
2
|
+
|
3
|
+
RDig provides an HTTP crawler and content extraction utilities
|
4
|
+
to help build a site search for web sites or intranets. Internally,
|
5
|
+
Ferret is used for the full text indexing. After creating a config file
|
6
|
+
for your site, the index can be built with a single call to rdig.
|
7
|
+
|
8
|
+
RDig depends on Ferret (>= 0.3.2) and the RubyfulSoup library (>= 1.0.4).
|
9
|
+
|
10
|
+
== basic usage
|
11
|
+
|
12
|
+
|
13
|
+
=== Index creation
|
14
|
+
- create a config file based on the template in doc/examples
|
15
|
+
- to create an index:
|
16
|
+
rdig -c CONFIGFILE
|
17
|
+
- to run a query against the index (just to try it out)
|
18
|
+
rdig -c CONFIGFILE -q 'your query'
|
19
|
+
this will dump the first 10 search results to STDOUT
|
20
|
+
|
21
|
+
=== Handle search in your application:
|
22
|
+
require 'rdig'
|
23
|
+
require 'rdig_config' # load your config file here
|
24
|
+
search_results = RDig.searcher.search(query, options={})
|
25
|
+
|
26
|
+
see RDig::Search::Searcher for more information.
|
27
|
+
|
28
|
+
|
29
|
+
== usage in rails
|
30
|
+
|
31
|
+
- add to config/environment.rb :
|
32
|
+
require 'rdig'
|
33
|
+
require 'rdig_config'
|
34
|
+
- place rdig_config.rb into config/ directory.
|
35
|
+
- build index:
|
36
|
+
rdig -c config/rdig_config.rb
|
37
|
+
- in your controller that handles the search form:
|
38
|
+
search_results = RDig.searcher.search(params[:query])
|
39
|
+
@results = search_results[:list]
|
40
|
+
@hitcount = search_results[:hitcount]
|
41
|
+
|
42
|
+
=== search result paging
|
43
|
+
Use the :first_doc and :num_docs options to implement
|
44
|
+
paging through search results.
|
45
|
+
(:num_docs is 10 by default, so without using these options only the first 10
|
46
|
+
results will be retrieved)
|
47
|
+
|
48
|
+
|
49
|
+
== sample configuration
|
50
|
+
|
51
|
+
from doc/examples/config.rb. The tag_selector properties are called
|
52
|
+
with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
|
53
|
+
You can also have a look at the +html_content_extractor+ unit test.
|
54
|
+
|
55
|
+
See http://www.crummy.com/software/RubyfulSoup/documentation.html for API documentation of the
|
56
|
+
Rubyful Soup lib used
|
57
|
+
|
58
|
+
:include:doc/examples/config.rb
|
59
|
+
|
60
|
+
|
61
|
+
|
data/TODO
ADDED
File without changes
|
data/bin/rdig
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line entry point for RDig.
#
# Build an index:
#   rdig -c CONFIGFILE
#
# Run a query against an existing index (dumps the first results to STDOUT):
#   rdig -c CONFIGFILE -q 'your query'

begin
  require 'rdig'
rescue LoadError
  # rdig was not found on the plain load path; retry with RubyGems loaded.
  require 'rubygems'
  require 'rdig'
end

# Hand control over to the RDig application object.
RDig.application.run
|
@@ -0,0 +1,53 @@
|
|
1
|
+
RDig.configuration do |cfg|

  ##################################################################
  # Settings you should always provide

  # One or more URLs the crawler starts from.
  cfg.crawler.start_urls = [ 'http://www.example.com/' ]

  # Hosts the crawl is restricted to. Links pointing to any host not
  # listed here are never followed.
  cfg.crawler.include_hosts = [ 'www.example.com' ]

  # Directory the index is stored in.
  # Caution: existing contents of this directory will be deleted!
  cfg.ferret.path = '/path/to/index'

  ##################################################################
  # Optional settings; the commented-out values shown are the defaults

  # --- content extraction ---

  # Method selecting the tag that holds the title of a document.
  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }

  # Method selecting the tag that holds the page content you want to
  # index. Useful to keep common elements such as navigation and page
  # footers from being indexed for every page.
  # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }

  # --- crawler ---

  # nil (index all documents) or an array of Regexps matching the URLs
  # you want to index.
  # cfg.crawler.include_documents = nil

  # nil (no documents excluded) or an array of Regexps matching URLs not
  # to index. This filter runs after the one above, so it only needs to
  # exclude documents that are unwanted but would be included by the
  # inclusion patterns.
  # cfg.crawler.exclude_documents = nil

  # Number of http fetching threads to use.
  # cfg.crawler.num_threads = 2

  # Maximum number of http redirections to follow.
  # cfg.crawler.max_redirects = 5

  # Number of seconds to wait with an empty url queue before finishing
  # the crawl. Set to a higher number for slow sites.
  # cfg.crawler.wait_before_leave = 10

end
|
data/install.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
require 'find'
|
3
|
+
require 'ftools'
|
4
|
+
|
5
|
+
include Config
|
6
|
+
|
7
|
+
$ruby = CONFIG['ruby_install_name']
|
8
|
+
|
9
|
+
##
|
10
|
+
# Install a binary file. We patch in on the way through to
|
11
|
+
# insert a #! line. If this is a Unix install, we name
|
12
|
+
# the command (for example) 'rdig' and let the shebang line
|
13
|
+
# handle running it. Under windows, we add a '.rb' extension
|
14
|
+
# and let file associations do their stuff
|
15
|
+
#
|
16
|
+
# based on install.rb from the Rake distribution
|
17
|
+
|
18
|
+
# Install the script +from+ into $bindir under the name +opfile+.
#
# The script is first copied to a temporary file with a "#!<ruby> -w" line
# prepended (built from $realbindir and $ruby), then installed with mode
# 0755. When the target OS matches /mswin/ a ".rb" extension is appended
# to the installed name.
#
# from::   path of the script to install
# opfile:: name of the installed command
#
# Raises RuntimeError when no writable temporary directory can be found.
def installBIN(from, opfile)
  # First candidate that exists, is a directory and is writable wins.
  tmp_dir = [".", "/tmp", "c:/temp", $bindir].find do |dir|
    begin
      stat = File.stat(dir)
      stat.directory? && stat.writable?
    rescue StandardError
      # missing or unusable candidate: try the next one
      false
    end
  end

  fail "Cannot find a temporary directory" unless tmp_dir
  tmp_file = File.join(tmp_dir, "_tmp")

  begin
    File.open(from) do |ip|
      File.open(tmp_file, "w") do |op|
        ruby = File.join($realbindir, $ruby)
        op.puts "#!#{ruby} -w"
        op.write ip.read
      end
    end

    opfile += ".rb" if CONFIG["target_os"] =~ /mswin/i
    File::install(tmp_file, File.join($bindir, opfile), 0755, true)
  ensure
    # Always remove the scratch copy, even when the copy or install failed.
    File::unlink(tmp_file) if File.exist?(tmp_file)
  end
end
|
44
|
+
|
45
|
+
# Work out where the library files go. The configured sitelibdir is used
# when present; otherwise fall back to a site_ruby entry from the load
# path (or one built below libdir) and make sure it carries the version.
$sitedir = CONFIG["sitelibdir"]
unless $sitedir
  version = CONFIG["MAJOR"] + "." + CONFIG["MINOR"]
  $libdir = File.join(CONFIG["libdir"], "ruby", version)
  $sitedir = $:.find { |x| x =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, "site_ruby")
  elsif !$sitedir.include?(version)
    # Plain substring test: matching a String against the String returned
    # by Regexp.quote raises TypeError.
    $sitedir = File.join($sitedir, version)
  end
end

$bindir = CONFIG["bindir"]

# installBIN puts this directory into the shebang line, so it keeps the
# configured bindir even when $bindir is redirected through DESTDIR below.
$realbindir = $bindir

# Staged install: prefix both target directories with DESTDIR and make
# sure they exist.
if (destdir = ENV['DESTDIR'])
  $bindir  = destdir + $bindir
  $sitedir = destdir + $sitedir

  File::makedirs($bindir)
  File::makedirs($sitedir)
end

rdig_dest = File.join($sitedir, "rdig")
File::makedirs(rdig_dest, true)
File::chmod(0755, rdig_dest)

# The library files

files = Dir.chdir('lib') { Dir['**/*.rb'] }

files.each do |fn|
  # Recreate the file's sub directory below $sitedir before copying it.
  target_dir = File.join($sitedir, File.dirname(fn))
  File.makedirs(target_dir) unless File.exist?(target_dir)
  File::install(File.join('lib', fn), File.join($sitedir, fn), 0644, true)
end

# and the executable

installBIN("bin/rdig", "rdig")
|
@@ -0,0 +1,21 @@
|
|
1
|
+
== 2.2 (2005-11-07)
|
2
|
+
* Important bug fixes -- thanks to Moonwolf
|
3
|
+
* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
|
4
|
+
* Decimal decoding edge cases addressed.
|
5
|
+
* Test cases added.
|
6
|
+
|
7
|
+
== 2.1 (2005-10-31)
|
8
|
+
* Removed some unnecessary code in basic entity encoding.
|
9
|
+
* Improved handling of encoding: commands are now automatically sorted, so the
|
10
|
+
user doesn't have to worry about their order.
|
11
|
+
* Now using setup.rb.
|
12
|
+
* Tests moved to separate file.
|
13
|
+
|
14
|
+
== 2.0 (2005-08-23)
|
15
|
+
* Added encoding to entities.
|
16
|
+
* Decoding interface unchanged.
|
17
|
+
* Fixed a bug with handling high codepoints.
|
18
|
+
|
19
|
+
== 1.0 (2005-08-03)
|
20
|
+
* Initial release.
|
21
|
+
* Decoding only.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
HTML entity encoding and decoding for Ruby
|
2
|
+
|
3
|
+
This library extends the String class to allow encoding and decoding of
|
4
|
+
HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
5
|
+
|
6
|
+
To install (requires root/admin privileges):
|
7
|
+
|
8
|
+
# ruby setup.rb
|
9
|
+
|
10
|
+
To test:
|
11
|
+
|
12
|
+
$ ruby setup.rb test
|
13
|
+
|
14
|
+
Comments are welcome. Send an email to pbattley @ gmail.com.
|
15
|
+
|
@@ -0,0 +1,281 @@
|
|
1
|
+
#
|
2
|
+
# HTML entity encoding and decoding for Ruby
|
3
|
+
#
|
4
|
+
# Author:: Paul BATTLEY (pbattley @ gmail.com)
|
5
|
+
# Version:: 2.2
|
6
|
+
# Date:: 2005-11-07
|
7
|
+
#
|
8
|
+
# == About
|
9
|
+
#
|
10
|
+
# This library extends the String class to allow encoding and decoding of
|
11
|
+
# HTML/XML entities from/to their corresponding UTF-8 codepoints.
|
12
|
+
#
|
13
|
+
# == Licence
|
14
|
+
#
|
15
|
+
# Copyright (c) 2005 Paul Battley
|
16
|
+
#
|
17
|
+
# Usage of the works is permitted provided that this instrument is retained
|
18
|
+
# with the works, so that any entity that uses the works is notified of this
|
19
|
+
# instrument.
|
20
|
+
#
|
21
|
+
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
|
22
|
+
#
|
23
|
+
|
24
|
+
# Lookup tables and precompiled patterns used by String#decode_entities
# and String#encode_entities.
module HTMLEntities

  VERSION = '2.2'

  #
  # MAP is a hash of all the HTML entities I could discover, as taken
  # from the w3schools page on the subject:
  # http://www.w3schools.com/html/html_entitiesref.asp
  # The format is 'entity name' => codepoint where entity name is given
  # without the surrounding ampersand and semicolon.
  #
  MAP = {
    'quot'    => 34,
    'apos'    => 39,
    'amp'     => 38,
    'lt'      => 60,
    'gt'      => 62,
    'nbsp'    => 160,
    'iexcl'   => 161,
    'curren'  => 164,
    'cent'    => 162,
    'pound'   => 163,
    'yen'     => 165,
    'brvbar'  => 166,
    'sect'    => 167,
    'uml'     => 168,
    'copy'    => 169,
    'ordf'    => 170,
    'laquo'   => 171,
    'not'     => 172,
    'shy'     => 173,
    'reg'     => 174,
    'trade'   => 8482,
    'macr'    => 175,
    'deg'     => 176,
    'plusmn'  => 177,
    'sup2'    => 178,
    'sup3'    => 179,
    'acute'   => 180,
    'micro'   => 181,
    'para'    => 182,
    'middot'  => 183,
    'cedil'   => 184,
    'sup1'    => 185,
    'ordm'    => 186,
    'raquo'   => 187,
    'frac14'  => 188,
    'frac12'  => 189,
    'frac34'  => 190,
    'iquest'  => 191,
    'times'   => 215,
    'divide'  => 247,
    'Agrave'  => 192,
    'Aacute'  => 193,
    'Acirc'   => 194,
    'Atilde'  => 195,
    'Auml'    => 196,
    'Aring'   => 197,
    'AElig'   => 198,
    'Ccedil'  => 199,
    'Egrave'  => 200,
    'Eacute'  => 201,
    'Ecirc'   => 202,
    'Euml'    => 203,
    'Igrave'  => 204,
    'Iacute'  => 205,
    'Icirc'   => 206,
    'Iuml'    => 207,
    'ETH'     => 208,
    'Ntilde'  => 209,
    'Ograve'  => 210,
    'Oacute'  => 211,
    'Ocirc'   => 212,
    'Otilde'  => 213,
    'Ouml'    => 214,
    'Oslash'  => 216,
    'Ugrave'  => 217,
    'Uacute'  => 218,
    'Ucirc'   => 219,
    'Uuml'    => 220,
    'Yacute'  => 221,
    'THORN'   => 222,
    'szlig'   => 223,
    'agrave'  => 224,
    'aacute'  => 225,
    'acirc'   => 226,
    'atilde'  => 227,
    'auml'    => 228,
    'aring'   => 229,
    'aelig'   => 230,
    'ccedil'  => 231,
    'egrave'  => 232,
    'eacute'  => 233,
    'ecirc'   => 234,
    'euml'    => 235,
    'igrave'  => 236,
    'iacute'  => 237,
    'icirc'   => 238,
    'iuml'    => 239,
    'eth'     => 240,
    'ntilde'  => 241,
    'ograve'  => 242,
    'oacute'  => 243,
    'ocirc'   => 244,
    'otilde'  => 245,
    'ouml'    => 246,
    'oslash'  => 248,
    'ugrave'  => 249,
    'uacute'  => 250,
    'ucirc'   => 251,
    'uuml'    => 252,
    'yacute'  => 253,
    'thorn'   => 254,
    'yuml'    => 255,
    'OElig'   => 338,
    'oelig'   => 339,
    'Scaron'  => 352,
    'scaron'  => 353,
    'Yuml'    => 376,
    'circ'    => 710,
    'tilde'   => 732,
    'ensp'    => 8194,
    'emsp'    => 8195,
    'thinsp'  => 8201,
    'zwnj'    => 8204,
    'zwj'     => 8205,
    'lrm'     => 8206,
    'rlm'     => 8207,
    'ndash'   => 8211,
    'mdash'   => 8212,
    'lsquo'   => 8216,
    'rsquo'   => 8217,
    'sbquo'   => 8218,
    'ldquo'   => 8220,
    'rdquo'   => 8221,
    'bdquo'   => 8222,
    'dagger'  => 8224,
    'Dagger'  => 8225,
    'hellip'  => 8230,
    'permil'  => 8240,
    'lsaquo'  => 8249,
    'rsaquo'  => 8250,
    'euro'    => 8364
  }

  # Shortest and longest entity name found in MAP.
  MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
  MAX_LENGTH = MAP.keys.map{ |a| a.length }.max

  # Precompile the regexp. A name starts with a letter and may continue
  # with letters or digits, so that names such as 'frac12' or 'sup2' are
  # recognised as well; the overall length is bounded by the names in MAP.
  NAMED_ENTITY_REGEXP =
    /&([a-z][a-z0-9]{#{MIN_LENGTH - 1},#{MAX_LENGTH - 1}});/i

  # Reverse map for converting characters to named entities
  REVERSE_MAP = MAP.invert

  BASIC_ENTITY_REGEXP = /[<>'"&]/

  # Matches one ASCII control character or one non-ASCII character.
  # Written with the /u flag and ASCII-only escapes: raw high-byte
  # escapes in a regexp literal are rejected as an invalid multibyte
  # escape by Ruby 1.9 and later.
  UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[^\x00-\x7f]/u

end
|
184
|
+
|
185
|
+
# Entity encoding/decoding extensions. Both methods look up the tables and
# patterns of the HTMLEntities module when they are called.
class String

  # Because there's no need to make the user worry about the order here,
  # let's handle it.
  ENCODE_ENTITIES_COMMAND_ORDER = {
    :basic => 0,
    :named => 1,
    :decimal => 2,
    :hexadecimal => 3
  }

  #
  # Decode XML and HTML 4.01 entities in a string into their UTF-8
  # equivalents.  Obviously, if your string is not already in UTF-8, you'd
  # better convert it before using this method, or the output will be mixed
  # up.
  #
  # Named, decimal and hexadecimal references are all handled in one single
  # pass, so text produced by decoding one reference is never decoded a
  # second time ("&amp;#65;" becomes "&#65;", not "A").
  #
  # Unknown named entities are not converted. Numeric references that do
  # not denote a Unicode scalar value (above U+10FFFF or in the surrogate
  # range) are left untouched as well, because packing them would yield
  # invalid UTF-8.
  #
  # Returns a new String.
  #
  def decode_entities
    gsub(/&(?:([a-z][a-z0-9]*)|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i) {
      # Copy the match data before calling anything else.
      whole, name, dec, hex = $&, $1, $2, $3
      if name
        HTMLEntities::MAP.has_key?(name) ? [HTMLEntities::MAP[name]].pack('U') : whole
      else
        codepoint = dec ? dec.to_i : hex.to_i(16)
        if codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)
          whole
        else
          [codepoint].pack('U')
        end
      end
    }
  end

  #
  # Encode codepoints into their corresponding entities.  Various operations
  # are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # Examples:
  #   str.encode_entities - XML-safe
  #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
  #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
  #   non-ASCII characters replaced with their named entity where possible, and
  #   decimal equivalents otherwise.
  #
  # Note: It is the program's responsibility to ensure that the string
  # contains valid UTF-8 before calling this method.
  #
  # Returns a new String. Raises RuntimeError for an unknown instruction.
  #
  def encode_entities(*instructions)
    instructions = [:basic] if instructions.empty?

    instructions.each do |instr|
      unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
        raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
      end
    end

    ordered = instructions.sort_by { |instr| ENCODE_ENTITIES_COMMAND_ORDER[instr] }

    ordered.inject(self) do |str, instruction|
      case instruction
      when :basic
        # unpack('U') yields the integer codepoint on every Ruby version,
        # whereas String#[] with an index returns a String from Ruby 1.9 on.
        str.gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
          "&#{HTMLEntities::REVERSE_MAP[$&.unpack('U')[0]]};"
        }
      when :named
        # Test everything except printable ASCII
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          cp = $&.unpack('U')[0]
          (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
        }
      when :decimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&##{$&.unpack('U')[0]};"
        }
      when :hexadecimal
        str.gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
          "&#x#{$&.unpack('U')[0].to_s(16)};"
        }
      end
    end
  end

end
|