webtranslateit-hpricot 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,155 @@
1
+ import java.io.IOException;
2
+
3
+ import org.jruby.Ruby;
4
+ import org.jruby.RubyArray;
5
+ import org.jruby.RubyClass;
6
+ import org.jruby.RubyHash;
7
+ import org.jruby.RubyModule;
8
+ import org.jruby.RubyNumeric;
9
+ import org.jruby.RubyObject;
10
+ import org.jruby.RubyObjectAdapter;
11
+ import org.jruby.RubyRegexp;
12
+ import org.jruby.RubyString;
13
+ import org.jruby.anno.JRubyMethod;
14
+ import org.jruby.exceptions.RaiseException;
15
+ import org.jruby.javasupport.JavaEmbedUtils;
16
+ import org.jruby.runtime.Arity;
17
+ import org.jruby.runtime.Block;
18
+ import org.jruby.runtime.ObjectAllocator;
19
+ import org.jruby.runtime.ThreadContext;
20
+ import org.jruby.runtime.builtin.IRubyObject;
21
+ import org.jruby.runtime.callback.Callback;
22
+ import org.jruby.exceptions.RaiseException;
23
+ import org.jruby.runtime.load.BasicLibraryService;
24
+ import org.jruby.util.ByteList;
25
+
26
+ public class HpricotCss {
27
+ public void FILTER(String id) {
28
+ IRubyObject[] args = new IRubyObject[fargs];
29
+ System.arraycopy(fvals, 0, args, 0, fargs);
30
+ mod.callMethod(ctx, id, args);
31
+ tmpt.rb_clear();
32
+ fargs = 1;
33
+ }
34
+
35
+ public void FILTERAUTO() {
36
+ try {
37
+ FILTER(new String(data, ts, te - ts, "ISO-8859-1"));
38
+ } catch(java.io.UnsupportedEncodingException e) {}
39
+ }
40
+
41
+ public void PUSH(int aps, int ape) {
42
+ RubyString str = RubyString.newString(runtime, data, aps, ape-aps);
43
+ fvals[fargs++] = str;
44
+ tmpt.append(str);
45
+ }
46
+
47
+ private IRubyObject self, mod, str, node;
48
+ private int cs, act, eof, p, pe, ts, te, aps, ape, aps2, ape2;
49
+ private byte[] data;
50
+
51
+ private int fargs = 1;
52
+ private IRubyObject[] fvals = new IRubyObject[6];
53
+ private RubyArray focus;
54
+ private RubyArray tmpt;
55
+ private Ruby runtime;
56
+ private ThreadContext ctx;
57
+
58
+ public HpricotCss(IRubyObject self, IRubyObject mod, IRubyObject str, IRubyObject node) {
59
+ this.self = self;
60
+ this.mod = mod;
61
+ this.str = str;
62
+ this.node = node;
63
+ this.runtime = self.getRuntime();
64
+ this.ctx = runtime.getCurrentContext();
65
+ this.focus = RubyArray.newArray(runtime, node);
66
+ this.tmpt = runtime.newArray();
67
+
68
+ fvals[0] = focus;
69
+
70
+ if(!(str instanceof RubyString)) {
71
+ throw runtime.newArgumentError("bad CSS selector, String only please.");
72
+ }
73
+
74
+ ByteList bl = ((RubyString)str).getByteList();
75
+
76
+ data = bl.bytes;
77
+ p = bl.begin;
78
+ pe = p + bl.realSize;
79
+ eof = pe;
80
+ }
81
+
82
+ %%{
83
+ machine hpricot_css;
84
+
85
+ action a {
86
+ aps = p;
87
+ }
88
+
89
+ action b {
90
+ ape = p;
91
+ PUSH(aps, ape);
92
+ }
93
+
94
+ action c {
95
+ ape = p;
96
+ aps2 = p;
97
+ }
98
+
99
+ action d {
100
+ ape2 = p;
101
+ PUSH(aps, ape);
102
+ PUSH(aps2, ape2);
103
+ }
104
+
105
+ commas = space* "," space*;
106
+ traverse = [>+~];
107
+ sdot = "\\.";
108
+ utfw = alnum | "_" | "-" |
109
+ (0xc4 0xa8..0xbf) | (0xc5..0xdf 0x80..0xbf) |
110
+ (0xe0..0xef 0x80..0xbf 0x80..0xbf) |
111
+ (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
112
+ utfword = utfw+;
113
+ utfname = (utfw | sdot)+;
114
+ quote1 = "'" [^']* "'";
115
+ quote2 = '"' [^"]* '"';
116
+
117
+ cssid = "#" %a utfname %b;
118
+ cssclass = "." %a utfname %b;
119
+ cssname = "[name=" %a utfname %b "]";
120
+ cssattr = "[" %a utfname %c space* [^ \n\t]? "=" %d space* (quote1 | quote2 | [^\]]+) "]";
121
+ csstag = utfname >a %b;
122
+ cssmod = ("even" | "odd" | (digit | "n" | "+" | "-")* );
123
+ csschild = ":" %a ("only" | "nth" | "last" | "first") "-child" %b ("(" %a cssmod %b ")")?;
124
+ csspos = ":" %a ("nth" | "eq" | "gt" | "lt" | "first" | "last" | "even" | "odd") %b ("(" %a digit+ %b ")")?;
125
+ pseudop = "(" [^)]+ ")";
126
+ pseudoq = "'" (pseudop+ | [^'()]*) "'" |
127
+ '"' (pseudop+ | [^"()]*) '"' |
128
+ (pseudop+ | [^"()]*);
129
+ pseudo = ":" %a utfname %b ("(" %a pseudoq %b ")")?;
130
+
131
+ main := |*
132
+ cssid => { FILTER("ID"); };
133
+ cssclass => { FILTER("CLASS"); };
134
+ cssname => { FILTER("NAME"); };
135
+ cssattr => { FILTER("ATTR"); };
136
+ csstag => { FILTER("TAG"); };
137
+ cssmod => { FILTER("MOD"); };
138
+ csschild => { FILTER("CHILD"); };
139
+ csspos => { FILTER("POS"); };
140
+ pseudo => { FILTER("PSUEDO"); };
141
+ commas => { focus = RubyArray.newArray(runtime, node); };
142
+ traverse => { FILTERAUTO(); };
143
+ space;
144
+ *|;
145
+
146
+ write data nofinal;
147
+ }%%
148
+
149
+ public IRubyObject scan() {
150
+ %% write init;
151
+ %% write exec;
152
+
153
+ return focus;
154
+ }
155
+ }
@@ -0,0 +1,120 @@
1
+ /*
2
+ * hpricot_css.rl
3
+ * ragel -C hpricot_css.rl -o hpricot_css.c
4
+ *
5
+ * Copyright (C) 2008 why the lucky stiff
6
+ */
7
+ #include <ruby.h>
8
+
9
/*
 * Helper macros used by the scanner actions below. They expand inside
 * hpricot_css() and rely on its locals: mod, fargs, fvals, tmpt, ts, te.
 */

/*
 * FILTER(id): call the method named `id` on `mod` with the first `fargs`
 * entries of `fvals`, then reset the capture state. Wrapped in
 * do { } while (0) so it behaves as a single statement.
 */
#define FILTER(id) \
  do { \
    rb_funcall2(mod, rb_intern("" # id), fargs, fvals); \
    rb_ary_clear(tmpt); \
    fargs = 1; \
  } while (0)

/*
 * FILTERAUTO(): same as FILTER, but the method name is the text of the
 * current token (ts..te). The precision argument of "%.*s" must be an int,
 * so the pointer difference is cast; snprintf bounds the copy to the buffer.
 */
#define FILTERAUTO() \
  do { \
    char filt[10]; \
    snprintf(filt, sizeof(filt), "%.*s", (int)(te - ts), ts); \
    rb_funcall2(mod, rb_intern(filt), fargs, fvals); \
    rb_ary_clear(tmpt); \
    fargs = 1; \
  } while (0)

/* STRNEW(a, len): build a Ruby string from `len` bytes starting at `a`. */
#ifdef HAVE_RUBY_ENCODING_H
#define STRNEW(a, len) rb_external_str_new((a), (len))
#else
#define STRNEW(a, len) rb_str_new((a), (len))
#endif

/*
 * PUSH(aps, ape): capture the bytes aps..ape as a Ruby string, store it as
 * the next filter argument and append it to tmpt.
 */
#define PUSH(aps, ape) \
  rb_ary_push(tmpt, fvals[fargs++] = STRNEW((aps), (ape) - (aps)))

/* P(id): debug helper that prints the current token prefixed by `id`. */
#define P(id) printf(id ": %.*s\n", (int)(te - ts), ts);
26
+
27
%%{
  machine hpricot_css;

  # a: remember where the current capture starts.
  action a {
    aps = p;
  }

  # b: close the current capture and push it as a filter argument.
  action b {
    ape = p;
    PUSH(aps, ape);
  }

  # c: close the first capture and open a second one at the same position.
  action c {
    ape = p;
    aps2 = p;
  }

  # d: close the second capture and push both captures, first then second.
  action d {
    ape2 = p;
    PUSH(aps, ape);
    PUSH(aps2, ape2);
  }

  commas = space* "," space*;
  traverse = [>+~];
  # A dot preceded by a backslash.
  sdot = "\\.";
  # Name characters: ASCII alphanumerics, underscore, hyphen, or a multi-byte
  # sequence in the byte ranges below (presumably UTF-8 encoded characters).
  utfw = alnum | "_" | "-" |
    (0xc4 0xa8..0xbf) | (0xc5..0xdf 0x80..0xbf) |
    (0xe0..0xef 0x80..0xbf 0x80..0xbf) |
    (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
  utfword = utfw+;
  utfname = (utfw | sdot)+;
  quote1 = "'" [^']* "'";
  quote2 = '"' [^"]* '"';

  cssid = "#" %a utfname %b;
  cssclass = "." %a utfname %b;
  cssname = "[name=" %a utfname %b "]";
  # Captures the attribute name, then everything between the name and the
  # end of the equals sign (optional whitespace, optional operator char).
  cssattr = "[" %a utfname %c space* [^ \n\t]? "=" %d space* (quote1 | quote2 | [^\]]+) "]";
  csstag = utfname >a %b;
  cssmod = ("even" | "odd" | (digit | "n" | "+" | "-")* );
  csschild = ":" %a ("only" | "nth" | "last" | "first") "-child" %b ("(" %a cssmod %b ")")?;
  csspos = ":" %a ("nth" | "eq" | "gt" | "lt" | "first" | "last" | "even" | "odd") %b ("(" %a digit+ %b ")")?;
  pseudop = "(" [^)]+ ")";
  pseudoq = "'" (pseudop+ | [^'()]*) "'" |
    '"' (pseudop+ | [^"()]*) '"' |
    (pseudop+ | [^"()]*);
  pseudo = ":" %a utfname %b ("(" %a pseudoq %b ")")?;

  # Scanner: each recognised token dispatches to the filter of that name.
  # FILTER stringifies its argument into a method name looked up on mod, so
  # the identifiers must be kept exactly as written (including PSUEDO).
  main := |*
    cssid => { FILTER(ID); };
    cssclass => { FILTER(CLASS); };
    cssname => { FILTER(NAME); };
    cssattr => { FILTER(ATTR); };
    csstag => { FILTER(TAG); };
    cssmod => { FILTER(MOD); };
    csschild => { FILTER(CHILD); };
    csspos => { FILTER(POS); };
    pseudo => { FILTER(PSUEDO); };
    # NOTE(review): this replaces focus but leaves fvals[0] pointing at the
    # previous array - confirm that is intended.
    commas => { focus = rb_ary_new3(1, node); };
    traverse => { FILTERAUTO(); };
    space;
  *|;

  write data nofinal;
}%%
93
+
94
+ VALUE hpricot_css(VALUE self, VALUE mod, VALUE str, VALUE node)
95
+ {
96
+ int cs, act, eof;
97
+ char *p, *pe, *ts, *te, *aps, *ape, *aps2, *ape2;
98
+
99
+ int fargs = 1;
100
+ VALUE fvals[6];
101
+ VALUE focus = rb_ary_new3(1, node);
102
+ VALUE tmpt = rb_ary_new();
103
+ rb_gc_register_address(&focus);
104
+ rb_gc_register_address(&tmpt);
105
+ fvals[0] = focus;
106
+
107
+ if (TYPE(str) != T_STRING)
108
+ rb_raise(rb_eArgError, "bad CSS selector, String only please.");
109
+
110
+ StringValue(str);
111
+ p = RSTRING_PTR(str);
112
+ pe = p + RSTRING_LEN(str);
113
+
114
+ %% write init;
115
+ %% write exec;
116
+
117
+ rb_gc_unregister_address(&focus);
118
+ rb_gc_unregister_address(&tmpt);
119
+ return focus;
120
+ }