webtranslateit-hpricot 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,4 @@
1
# Build configuration for the fast_xs C extension: probes the toolchain and
# writes the Makefile that compiles fast_xs.c.
require 'mkmf'

# A bare `exit` terminates with status 0, which makes a failed probe look like
# a successful configure run that simply produced no Makefile. Abort with a
# message and a non-zero status instead.
have_header('stdio.h') or abort 'fast_xs: stdio.h not found; cannot build the extension'

# Accept --with-fast_xs-dir / -include / -lib options on the command line.
dir_config('fast_xs')
create_makefile('fast_xs')
@@ -0,0 +1,210 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+
4
/*
 * ASSOCIATE_INDEX(s, enc): tag Ruby string `s` with encoding `enc` when the
 * encoding API header is available; otherwise a no-op that does not evaluate
 * its arguments (they may name functions that do not exist in that build).
 */
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
#  define ASSOCIATE_INDEX(s,enc) rb_enc_associate_index((s), rb_enc_to_index(enc))
#else
#  define ASSOCIATE_INDEX(s,enc) ((void)0)
#endif

/*
 * Fallback accessors for Ruby headers that do not provide them.
 * Each macro is guarded on its own so that a header defining only some of
 * them does not end up with conflicting redefinitions of the others.
 */
#ifndef RARRAY_LEN
#  define RARRAY_LEN(arr) (RARRAY(arr)->len)
#endif
#ifndef RARRAY_PTR
#  define RARRAY_PTR(arr) (RARRAY(arr)->ptr)
#endif
#ifndef RSTRING_LEN
#  define RSTRING_LEN(str) (RSTRING(str)->len)
#endif
#ifndef RSTRING_PTR
#  define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
17
+
18
+ static ID unpack_id;
19
+ static VALUE U_fmt, C_fmt;
20
+
21
/*
 * Branch-prediction hints for GCC 3 and later; plain pass-through elsewhere.
 * Branches are laid out so that ASCII characters take the fast path.
 */
#if !defined(__GNUC__) || (__GNUC__ < 3)
#  define likely(x)	(x)
#  define unlikely(x)	(x)
#else
#  define likely(x)	__builtin_expect (!!(x), 1)
#  define unlikely(x)	__builtin_expect (!!(x), 0)
#endif
30
+
31
/*
 * Replacement code points for the byte values 128..159, indexed by
 * (value - 128). Most entries are the Unicode code point of the CP-1252
 * character at that position.
 *
 * Entries written as p(x) have no translation. p(x) evaluates to x - 128,
 * which is below 0x20; VALID_VALUE() rejects such values, so escape() emits
 * '*' for them.
 * NOTE(review): earlier comments described these entries as "pass-through"
 * (e.g. 129 => 129), which is not what the macro produces -- confirm whether
 * emitting '*' for 129/141/143/144/157 is the intended behaviour.
 */
#define p(x) ((x) - 128)

static const int cp_1252[] = {
	8364,		/* 128 => 8364, euro sign */
	p(129),		/* 129 => 1,    no translation */
	8218,		/* 130 => 8218, single low-9 quotation mark */
	402,		/* 131 =>  402, latin small letter f with hook */
	8222,		/* 132 => 8222, double low-9 quotation mark */
	8230,		/* 133 => 8230, horizontal ellipsis */
	8224,		/* 134 => 8224, dagger */
	8225,		/* 135 => 8225, double dagger */
	710,		/* 136 =>  710, modifier letter circumflex accent */
	8240,		/* 137 => 8240, per mille sign */
	352,		/* 138 =>  352, latin capital letter s with caron */
	8249,		/* 139 => 8249, single left-pointing angle quotation mark */
	338,		/* 140 =>  338, latin capital ligature oe */
	p(141),		/* 141 => 13,   no translation */
	381,		/* 142 =>  381, latin capital letter z with caron */
	p(143),		/* 143 => 15,   no translation */
	p(144),		/* 144 => 16,   no translation */
	8216,		/* 145 => 8216, left single quotation mark */
	8217,		/* 146 => 8217, right single quotation mark */
	8220,		/* 147 => 8220, left double quotation mark */
	8221,		/* 148 => 8221, right double quotation mark */
	8226,		/* 149 => 8226, bullet */
	8211,		/* 150 => 8211, en dash */
	8212,		/* 151 => 8212, em dash */
	732,		/* 152 =>  732, small tilde */
	8482,		/* 153 => 8482, trade mark sign */
	353,		/* 154 =>  353, latin small letter s with caron */
	8250,		/* 155 => 8250, single right-pointing angle quotation mark */
	339,		/* 156 =>  339, latin small ligature oe */
	p(157),		/* 157 => 29,   no translation */
	382,		/* 158 =>  382, latin small letter z with caron */
	376		/* 159 =>  376, latin capital letter y with diaeresis */
};

/* p() is only meaningful inside the table above; keep it out of the rest of the file */
#undef p
68
+
69
/*
 * Non-zero when code point n lies in one of the accepted ranges:
 * 0x20..0xD7FF, 0xE000..0xFFFD or 0x10000..0x10FFFF.
 *
 * The argument and the whole expansion are parenthesized so the macro can be
 * negated or combined with other operators safely. n is evaluated several
 * times, so it must be free of side effects.
 */
#define VALID_VALUE(n) \
	(((n) >= 0x20 && (n) <= 0xD7FF) || \
	 ((n) >= 0xE000 && (n) <= 0xFFFD) || \
	 ((n) >= 0x10000 && (n) <= 0x10FFFF))
73
+
74
/*
 * If n is in 128..159, replace it in place with its cp_1252[] entry;
 * any other value is left untouched.
 *
 * n must be a modifiable lvalue without side effects (it is evaluated more
 * than once). Wrapped in do/while(0) so it behaves as a single statement.
 */
#define CP_1252_ESCAPE(n) do { \
	if ((n) >= 128 && (n) <= 159) \
		(n) = cp_1252[(n) - 128]; \
	} while (0)
78
+
79
/*
 * Number of bytes in the decimal character reference "&#<n>;".
 *
 * n is expected to be non-negative. The result is exact for every digit
 * count from one to seven; callers in this file only pass values of three
 * digits or more, for which the result is unchanged, but the one- and
 * two-digit cases are handled so the helper cannot over-report a length.
 */
static inline size_t bytes_for(int n)
{
	if (n < 10)
		return sizeof("&#9;") - 1;
	if (n < 100)
		return sizeof("&#99;") - 1;
	if (n < 1000)
		return sizeof("&#999;") - 1;
	if (n < 10000)
		return sizeof("&#9999;") - 1;
	if (n < 100000)
		return sizeof("&#99999;") - 1;
	if (n < 1000000)
		return sizeof("&#999999;") - 1;
	/* seven digits is the maximum: nothing above 0x10FFFF reaches here */
	return sizeof("&#9999999;") - 1;
}
92
+
93
/*
 * Write the escaped form of code point n at buf and return the number of
 * bytes written. buf is not NUL-terminated; the caller must have reserved
 * enough room.
 *
 *  - n < 128: '"', '&', '<' and '>' become named entities; other characters
 *    at or above 0x20, plus TAB, LF and CR, are copied as-is; everything
 *    else becomes '*'.
 *  - n >= 128: values in 128..159 are first remapped via CP_1252_ESCAPE();
 *    a value accepted by VALID_VALUE() is written as "&#<decimal>;",
 *    anything else as '*'.
 */
static size_t escape(char *buf, int n)
{
	/* handle ASCII first */
	if (likely(n < 128)) {
		const char *entity;
		size_t entity_len;

		if (unlikely(!(n >= 0x20 || n == '\t' || n == '\n' || n == '\r'))) {
			*buf = '*';
			return 1;
		}

		switch (n) {
		case '"':
			entity = "&quot;";
			entity_len = sizeof("&quot;") - 1;
			break;
		case '&':
			entity = "&amp;";
			entity_len = sizeof("&amp;") - 1;
			break;
		case '<':
			entity = "&lt;";
			entity_len = sizeof("&lt;") - 1;
			break;
		case '>':
			entity = "&gt;";
			entity_len = sizeof("&gt;") - 1;
			break;
		default:
			*buf = (char)n;
			return 1;
		}

		memcpy(buf, entity, entity_len);
		return entity_len;
	}

	CP_1252_ESCAPE(n);

	if (VALID_VALUE(n)) {
		/*
		 * Equivalent to snprintf(buf, ..., "&#%i;", n): jump to the end
		 * of the reference and fill it in from right to left.
		 */
		char *out = buf + bytes_for(n);
		size_t written = sizeof("&#;") - 1;

		*--out = ';';
		do {
			*--out = (char)('0' + n % 10);
			++written;
		} while (n /= 10);
		*--out = '#';
		*--out = '&';

		return written;
	}

	*buf = '*';
	return 1;
}
141
+
142
+ static VALUE unpack_utf8(VALUE self)
143
+ {
144
+ return rb_funcall(self, unpack_id, 1, U_fmt);
145
+ }
146
+
147
+ static VALUE unpack_uchar(VALUE self, VALUE exc)
148
+ {
149
+ return rb_funcall(self, unpack_id, 1, C_fmt);
150
+ }
151
+
152
+ /*
153
+ * escapes strings for XML
154
+ * The double-quote (") character is translated to "&quot;"
155
+ */
156
+ static VALUE fast_xs(VALUE self)
157
+ {
158
+ long i;
159
+ VALUE array;
160
+ char *c;
161
+ size_t s_len;
162
+ VALUE *tmp;
163
+ VALUE rv;
164
+
165
+ array = rb_rescue(unpack_utf8, self, unpack_uchar, self);
166
+
167
+ for (tmp = RARRAY_PTR(array), s_len = i = RARRAY_LEN(array);
168
+ --i >= 0;
169
+ tmp++) {
170
+ int n = NUM2INT(*tmp);
171
+ if (likely(n < 128)) {
172
+ if (unlikely(n == '"'))
173
+ s_len += (sizeof("&quot;") - 2);
174
+ if (unlikely(n == '&'))
175
+ s_len += (sizeof("&amp;") - 2);
176
+ if (unlikely(n == '>' || n == '<'))
177
+ s_len += (sizeof("&gt;") - 2);
178
+ continue;
179
+ }
180
+
181
+ CP_1252_ESCAPE(n);
182
+
183
+ if (VALID_VALUE(n))
184
+ s_len += bytes_for(n) - 1;
185
+ }
186
+
187
+ rv = rb_str_new(NULL, s_len);
188
+ ASSOCIATE_INDEX(rv, rb_default_external_encoding());
189
+ c = RSTRING_PTR(rv);
190
+
191
+ for (tmp = RARRAY_PTR(array), i = RARRAY_LEN(array); --i >= 0; tmp++)
192
+ c += escape(c, NUM2INT(*tmp));
193
+
194
+ return rv;
195
+ }
196
+
197
+ void Init_fast_xs(void)
198
+ {
199
+ assert(cp_1252[159 - 128] == 376); /* just in case I skipped a line */
200
+
201
+ unpack_id = rb_intern("unpack");
202
+ U_fmt = rb_str_new("U*", 2);
203
+ ASSOCIATE_INDEX(U_fmt, rb_ascii8bit_encoding());
204
+ C_fmt = rb_str_new("C*", 2);
205
+ ASSOCIATE_INDEX(C_fmt, rb_ascii8bit_encoding());
206
+ rb_global_variable(&U_fmt);
207
+ rb_global_variable(&C_fmt);
208
+
209
+ rb_define_method(rb_cString, "fast_xs", fast_xs, 0);
210
+ }