webtranslateit-hpricot 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,4 @@
1
# Build configuration for the fast_xs C extension.
require 'mkmf'

# A missing stdio.h means there is no usable C toolchain. Stop with a
# non-zero status and a message instead of exiting "successfully"
# without having written a Makefile.
abort 'fast_xs: stdio.h not found; a working C compiler is required' unless have_header('stdio.h')

dir_config('fast_xs')
create_makefile('fast_xs')
@@ -0,0 +1,210 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+
4
/*
 * ASSOCIATE_INDEX(s, enc) associates the encoding enc with the string s
 * when the Ruby being built against provides <ruby/encoding.h>.
 * Without that header the macro expands to nothing, so its arguments
 * are never evaluated.
 */
#if !defined(HAVE_RUBY_ENCODING_H)
#  define ASSOCIATE_INDEX(s,enc)
#else
#  include <ruby/encoding.h>
#  define ASSOCIATE_INDEX(s,enc) rb_enc_associate_index((s), rb_enc_to_index(enc))
#endif
10
+
11
/*
 * Fallback accessor macros for ruby.h versions that do not provide them.
 * Each macro has its own guard: a header that ships only some of the
 * four must neither get a conflicting redefinition nor be left with a
 * missing accessor.
 */
#ifndef RARRAY_LEN
#  define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RARRAY_PTR
#  define RARRAY_PTR(arr) (RARRAY(arr)->ptr)
#endif
#ifndef RSTRING_LEN
#  define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#  define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
17
+
18
+ static ID unpack_id;
19
+ static VALUE U_fmt, C_fmt;
20
+
21
/*
 * Branch-prediction hints. GCC 3 and newer get __builtin_expect (the
 * branches are arranged so that ASCII characters are handled faster);
 * every other compiler simply sees the bare condition.
 */
#if !defined(__GNUC__) || (__GNUC__ < 3)
#  define likely(x)   (x)
#  define unlikely(x) (x)
#else
#  define likely(x)   __builtin_expect (!!(x), 1)
#  define unlikely(x) __builtin_expect (!!(x), 0)
#endif
30
+
31
/*
 * Translation table for CP-1252 bytes 128..159, indexed by (byte - 128).
 * Each entry is the code point the byte is replaced with.
 *
 * p() marks the positions that are passed through unchanged. It must
 * yield its argument as-is: the previous definition, (x-128), stored
 * 1, 13, 15, 16 and 29 in the table instead of the documented
 * 129, 141, 143, 144 and 157.
 */
#define p(x) (x)

static const int cp_1252[] = {
	8364,		/* 128 => 8364, euro sign */
	p(129),		/* 129 => 129, pass-through */
	8218,		/* 130 => 8218, single low-9 quotation mark */
	402,		/* 131 =>  402, latin small letter f with hook */
	8222,		/* 132 => 8222, double low-9 quotation mark */
	8230,		/* 133 => 8230, horizontal ellipsis */
	8224,		/* 134 => 8224, dagger */
	8225,		/* 135 => 8225, double dagger */
	710,		/* 136 =>  710, modifier letter circumflex accent */
	8240,		/* 137 => 8240, per mille sign */
	352,		/* 138 =>  352, latin capital letter s with caron */
	8249,		/* 139 => 8249, single left-pointing angle quotation mark */
	338,		/* 140 =>  338, latin capital ligature oe */
	p(141),		/* 141 => 141, pass-through */
	381,		/* 142 =>  381, latin capital letter z with caron */
	p(143),		/* 143 => 143, pass-through */
	p(144),		/* 144 => 144, pass-through */
	8216,		/* 145 => 8216, left single quotation mark */
	8217,		/* 146 => 8217, right single quotation mark */
	8220,		/* 147 => 8220, left double quotation mark */
	8221,		/* 148 => 8221, right double quotation mark */
	8226,		/* 149 => 8226, bullet */
	8211,		/* 150 => 8211, en dash */
	8212,		/* 151 => 8212, em dash */
	732,		/* 152 =>  732, small tilde */
	8482,		/* 153 => 8482, trade mark sign */
	353,		/* 154 =>  353, latin small letter s with caron */
	8250,		/* 155 => 8250, single right-pointing angle quotation mark */
	339,		/* 156 =>  339, latin small ligature oe */
	p(157),		/* 157 => 157, pass-through */
	382,		/* 158 =>  382, latin small letter z with caron */
	376		/* 159 =>  376, latin capital letter y with diaeresis */
};

/* p() is only a notation for the table above; keep it out of the rest of the file */
#undef p
68
+
69
/*
 * Non-zero when n lies in 0x20..0xD7FF, 0xE000..0xFFFD or
 * 0x10000..0x10FFFF.
 *
 * The expansion and every use of n are parenthesized so the macro keeps
 * its meaning inside larger expressions such as !VALID_VALUE(x) or
 * VALID_VALUE(a & b). n is evaluated more than once, so pass an
 * expression without side effects.
 */
#define VALID_VALUE(n) \
	((((n) >= 0x20) && ((n) <= 0xD7FF)) || \
	 (((n) >= 0xE000) && ((n) <= 0xFFFD)) || \
	 (((n) >= 0x10000) && ((n) <= 0x10FFFF)))

/*
 * Replaces n, in place, with its cp_1252[] entry when n is in 128..159;
 * any other value is left untouched. n must be a modifiable lvalue and
 * is evaluated more than once.
 */
#define CP_1252_ESCAPE(n) do { \
	if (((n) >= 128) && ((n) <= 159)) \
		(n) = cp_1252[(n) - 128]; \
} while (0)
78
+
79
/*
 * Length in bytes of the numeric character reference "&#<n>;" reserved
 * for n. The result is never below the three-digit form (6 bytes) and
 * never above the seven-digit form (10 bytes), since nothing above
 * 0x10FFFF is expected.
 */
static inline size_t bytes_for(int n)
{
	const size_t longest = sizeof("&#9999999;") - 1;
	size_t len = sizeof("&#999;") - 1;
	int limit = 1000;

	/* one more byte for every power of ten that n reaches */
	while (len < longest && n >= limit) {
		limit *= 10;
		++len;
	}
	return len;
}
92
+
93
/*
 * Writes the XML-escaped form of code point n at buf and returns the
 * number of bytes written. Nothing is NUL-terminated.
 *
 *   - '"', '&', '<' and '>' become &quot; &amp; &lt; &gt;
 *   - other values below 128 are copied as-is if they are >= 0x20 or
 *     one of tab, newline, carriage return; anything else becomes '*'
 *   - values from 128 up go through CP_1252_ESCAPE and are written as a
 *     decimal reference "&#N;" when VALID_VALUE accepts them, '*' otherwise
 *
 * The caller must provide enough room; fast_xs sizes its buffer with the
 * same rules before calling this.
 */
static size_t escape(char *buf, int n)
{
	const char *entity = NULL;
	size_t written = 0;
	char *tail;

	/* handle ASCII first */
	if (likely(n < 128)) {
		if (unlikely(!(n >= 0x20 || n == '\t' || n == '\n' || n == '\r'))) {
			*buf = '*';
			return 1;
		}

		switch (n) {
		case '"':
			entity = "&quot;";
			written = sizeof("&quot;") - 1;
			break;
		case '&':
			entity = "&amp;";
			written = sizeof("&amp;") - 1;
			break;
		case '<':
			entity = "&lt;";
			written = sizeof("&lt;") - 1;
			break;
		case '>':
			entity = "&gt;";
			written = sizeof("&gt;") - 1;
			break;
		default:
			break;
		}

		if (entity == NULL) {
			*buf = (char)n;
			return 1;
		}
		memcpy(buf, entity, written);
		return written;
	}

	CP_1252_ESCAPE(n);

	if (!(VALID_VALUE(n))) {
		*buf = '*';
		return 1;
	}

	/*
	 * Build "&#N;" from the last byte towards the first; bytes_for(n)
	 * gives the position just past the trailing ';'.
	 */
	tail = buf + bytes_for(n);
	*--tail = ';';
	written = sizeof("&#;") - 1;
	do {
		*--tail = (char)('0' + (n % 10));
		++written;
		n /= 10;
	} while (n != 0);
	*--tail = '#';
	*--tail = '&';

	return written;
}
141
+
142
+ static VALUE unpack_utf8(VALUE self)
143
+ {
144
+ return rb_funcall(self, unpack_id, 1, U_fmt);
145
+ }
146
+
147
+ static VALUE unpack_uchar(VALUE self, VALUE exc)
148
+ {
149
+ return rb_funcall(self, unpack_id, 1, C_fmt);
150
+ }
151
+
152
+ /*
153
+ * escapes strings for XML
154
+ * The double-quote (") character is translated to "&quot;"
155
+ */
156
+ static VALUE fast_xs(VALUE self)
157
+ {
158
+ long i;
159
+ VALUE array;
160
+ char *c;
161
+ size_t s_len;
162
+ VALUE *tmp;
163
+ VALUE rv;
164
+
165
+ array = rb_rescue(unpack_utf8, self, unpack_uchar, self);
166
+
167
+ for (tmp = RARRAY_PTR(array), s_len = i = RARRAY_LEN(array);
168
+ --i >= 0;
169
+ tmp++) {
170
+ int n = NUM2INT(*tmp);
171
+ if (likely(n < 128)) {
172
+ if (unlikely(n == '"'))
173
+ s_len += (sizeof("&quot;") - 2);
174
+ if (unlikely(n == '&'))
175
+ s_len += (sizeof("&amp;") - 2);
176
+ if (unlikely(n == '>' || n == '<'))
177
+ s_len += (sizeof("&gt;") - 2);
178
+ continue;
179
+ }
180
+
181
+ CP_1252_ESCAPE(n);
182
+
183
+ if (VALID_VALUE(n))
184
+ s_len += bytes_for(n) - 1;
185
+ }
186
+
187
+ rv = rb_str_new(NULL, s_len);
188
+ ASSOCIATE_INDEX(rv, rb_default_external_encoding());
189
+ c = RSTRING_PTR(rv);
190
+
191
+ for (tmp = RARRAY_PTR(array), i = RARRAY_LEN(array); --i >= 0; tmp++)
192
+ c += escape(c, NUM2INT(*tmp));
193
+
194
+ return rv;
195
+ }
196
+
197
+ void Init_fast_xs(void)
198
+ {
199
+ assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
200
+
201
+ unpack_id = rb_intern("unpack");
202
+ U_fmt = rb_str_new("U*", 2);
203
+ ASSOCIATE_INDEX(U_fmt, rb_ascii8bit_encoding());
204
+ C_fmt = rb_str_new("C*", 2);
205
+ ASSOCIATE_INDEX(C_fmt, rb_ascii8bit_encoding());
206
+ rb_global_variable(&U_fmt);
207
+ rb_global_variable(&C_fmt);
208
+
209
+ rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
210
+ }