webtranslateit-hpricot 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # The XChar library is provided courtesy of Sam Ruby (See
4
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb)
5
+
6
+ # --------------------------------------------------------------------
7
+
8
+ ######################################################################
9
module Hpricot

  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  # Holds the lookup tables used by Hpricot.xchr, Hpricot.xs and
  # Hpricot.uxs below.
  module XChar # :nodoc:

    # Maps Windows-1252 byte values (128..159) to the Unicode code points
    # they stand for. See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 =>  402, # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 =>  710, # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 =>  352, # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 =>  338, # latin capital ligature oe
      142 =>  381, # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 =>  732, # small tilde
      153 => 8482, # trade mark sign
      154 =>  353, # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 =>  339, # latin small ligature oe
      158 =>  382, # latin small letter z with caron
      159 =>  376, # latin capital letter y with diaeresis
    }

    # Code point => predefined XML entity.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    #
    # The values must be the escaped entity strings: xchr emits them as the
    # escaped form, and uxs looks up whole "&name;" matches in the inverse
    # table.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }

    # Inverse of PREDEFINED: entity string => code point.
    PREDEFINED_U = PREDEFINED.invert

    # Code points (single values and ranges) that may appear in an XML
    # document. See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is an Integer code point (the parameter name is historical).
    # Windows-1252 values are first remapped through XChar::CP1252.
    # Returns the predefined entity for the four markup characters, the
    # literal character for other ASCII, a decimal character reference for
    # anything >= 128, and '*' for code points XML does not allow.
    def xchr(str)
      n = XChar::CP1252[str] || str
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end

    # XML escaped version of to_s.
    #
    # The input is decoded as UTF-8 first; when that fails because the
    # bytes are not valid UTF-8, it is re-read byte by byte so that
    # ISO-8859-1 / Windows-1252 text is still escaped sensibly.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue ArgumentError
      # unpack('U*') raises ArgumentError on malformed UTF-8.
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Decodes the predefined named entities plus decimal (&#NNN;) and
    # hexadecimal (&#xHH;) character references. Any other named entity is
    # replaced by '?' (code 63). All entities are handled in a single pass
    # so that decoded output is never decoded a second time
    # (e.g. "&amp;#38;" yields "&#38;", not "&").
    def uxs(str)
      str.to_s.gsub(/&(?:(\w+)|#(\d+)|#x([0-9a-fA-F]+));/) do |entity|
        named, decimal, hex = $1, $2, $3
        if named
          (XChar::PREDEFINED_U[entity] || 63).chr # 63 = ? (query char)
        else
          [decimal ? decimal.to_i : hex.to_i(16)].pack('U*')
        end
      end
    end
  end
end
95
+
data/lib/hpricot.rb ADDED
@@ -0,0 +1,26 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
20
+ require 'hpricot_scan'
21
+ require 'hpricot/tag'
22
+ require 'hpricot/modules'
23
+ require 'hpricot/traverse'
24
+ require 'hpricot/inspect'
25
+ require 'hpricot/parse'
26
+ require 'hpricot/builder'