webtranslateit-hpricot 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # The XChar library is provided courtesy of Sam Ruby (See
4
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb)
5
+
6
+ # --------------------------------------------------------------------
7
+
8
+ ######################################################################
9
module Hpricot

  ####################################################################
  # XML character converter, from Sam Ruby
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  # Holds the lookup tables used by Hpricot.xchr, Hpricot.xs and
  # Hpricot.uxs below.
  module XChar # :nodoc:

    # Windows-1252 byte values (128..159) mapped to the Unicode code
    # points they stand for. See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 => 402,  # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 => 710,  # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 => 352,  # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 => 338,  # latin capital ligature oe
      142 => 381,  # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 => 732,  # small tilde
      153 => 8482, # trade mark sign
      154 => 353,  # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 => 339,  # latin small ligature oe
      158 => 382,  # latin small letter z with caron
      159 => 376   # latin capital letter y with diaeresis
    }.freeze

    # Code points that must be written as a named entity, mapped to that
    # entity. The values have to be the escaped forms ('&amp;', not '&'),
    # otherwise escaping is a no-op and unescaping cannot recognise them.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }.freeze

    # Reverse lookup: entity text (including '&' and ';') => code point.
    PREDEFINED_U = PREDEFINED.invert.freeze

    # Code points XML permits in character data.
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ].freeze
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is an Integer code point (the name is historical). Values in
    # the Windows-1252 range are first translated to their Unicode code
    # point. Returns the predefined entity, the plain ASCII character, a
    # decimal character reference, or '*' when the code point is not
    # allowed in XML.
    def xchr(str)
      n = XChar::CP1252[str] || str
      return '*' unless XChar::VALID.any? { |allowed| allowed === n }

      XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
    end

    # XML escaped version of to_s.
    #
    # The input is decoded as UTF-8 first; when that fails because the
    # bytes are not well-formed UTF-8, every byte is treated as a
    # single-byte (ISO-8859-1 / Windows-1252) character instead.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue ArgumentError # unpack('U*') rejects malformed UTF-8
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Decodes named entities, decimal references (&#60;) and hexadecimal
    # references (&#x3C;). Named entities other than the four predefined
    # ones become '?'. All three forms are handled in a single pass so
    # that text produced by one replacement is never unescaped a second
    # time (e.g. '&amp;#60;' yields '&#60;', not '<').
    def uxs(str)
      str.to_s.gsub(/&(?:(\w+)|\#(\d+)|\#x([0-9a-fA-F]+));/) do |entity|
        found = Regexp.last_match
        if found[1]
          (XChar::PREDEFINED_U[entity] || 63).chr # 63 = '?' (query char)
        elsif found[2]
          [found[2].to_i].pack('U*')
        else
          [found[3].to_i(16)].pack('U*')
        end
      end
    end
  end
end
95
+
data/lib/hpricot.rb ADDED
@@ -0,0 +1,26 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
20
+ require 'hpricot_scan'
21
+ require 'hpricot/tag'
22
+ require 'hpricot/modules'
23
+ require 'hpricot/traverse'
24
+ require 'hpricot/inspect'
25
+ require 'hpricot/parse'
26
+ require 'hpricot/builder'