fastxml 0.1.91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/fastxml.h ADDED
@@ -0,0 +1,61 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+
5
+ #ifndef fastxml_h
6
+ #define fastxml_h
7
+
8
+ #include <ruby.h>
9
+
10
+ #include <stdlib.h>
11
+ #include <stdio.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+
15
+ #include <libxml/tree.h>
16
+ #include <libxml/parser.h>
17
+ #include <libxml/xpath.h>
18
+ #include <libxml/xpathInternals.h>
19
+ #include <libxslt/xslt.h>
20
+ #include <libxslt/xsltInternals.h>
21
+ #include <libxml/HTMLparser.h>
22
+
23
+ #ifndef XML_WITH_TREE
24
+ #define XML_WITH_TREE 2
25
+ #define XML_WITH_XPATH 16
26
+ #endif
27
+
28
+ #ifndef HTML_PARSE_RECOVER
29
+ #define HTML_PARSE_RECOVER 1
30
+ #endif
31
+
32
+ typedef struct {
33
+ xmlDocPtr doc;
34
+ xmlNodePtr node;
35
+ xmlNodePtr list;
36
+ xmlXPathObjectPtr xpath_obj;
37
+ xsltStylesheetPtr xslt;
38
+ int list_len;
39
+ } fxml_data_t;
40
+
41
+
42
+ #ifndef fastxml_c
43
+ RUBY_EXTERN VALUE rb_cFastXmlDoc;
44
+ RUBY_EXTERN VALUE rb_cFastXmlNode;
45
+ RUBY_EXTERN VALUE rb_cFastXmlAttrList;
46
+
47
+ RUBY_EXTERN VALUE rb_sValidateDtd;
48
+ RUBY_EXTERN VALUE rb_sForgivingParse;
49
+ RUBY_EXTERN VALUE rb_sHtmlParse;
50
+
51
+ RUBY_EXTERN ID s_readlines;
52
+ RUBY_EXTERN ID s_to_s;
53
+
54
+ RUBY_EXTERN VALUE fastxml_xpath_search(VALUE self, VALUE raw_xpath, VALUE blk);
55
+ RUBY_EXTERN VALUE fastxml_raw_node_to_obj(xmlNodePtr cur);
56
+ RUBY_EXTERN VALUE fastxml_nodeset_to_obj(xmlXPathObjectPtr xpath_obj, fxml_data_t *data);
57
+ RUBY_EXTERN VALUE fastxml_nodelist_to_obj(xmlNodePtr root, int len);
58
+ RUBY_EXTERN void fastxml_data_mark( fxml_data_t *data );
59
+ RUBY_EXTERN void fastxml_data_free( fxml_data_t *data );
60
+ #endif
61
+ #endif /* fastxml_h */
data/ext/fastxml.o ADDED
Binary file
@@ -0,0 +1,60 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+
5
+ #include "fastxml.h"
6
+ #include "fastxml_node.h"
7
+ #include "fastxml_doc.h"
8
+ #include "fastxml_nodelist.h"
9
+ #include "fastxml_attrlist.h"
10
+
11
+ /* {{{ fastxml_attr_list
12
+ */
13
+
14
+ VALUE fastxml_attrlist_initialize(VALUE self)
15
+ {
16
+ return self;
17
+ }
18
+
19
+ VALUE fastxml_attrlist_indexer(VALUE self, VALUE attr_name)
20
+ {
21
+ VALUE ret, dv, attr_raw_str;
22
+ fxml_data_t *data;
23
+ xmlChar *raw_ret, *name_str;
24
+
25
+ dv = rb_iv_get( self, "@lxml_doc" );
26
+ Data_Get_Struct( dv, fxml_data_t, data );
27
+
28
+ attr_raw_str = rb_funcall( attr_name, s_to_s, 0 );
29
+ name_str = (xmlChar*)StringValuePtr( attr_raw_str );
30
+ raw_ret = xmlGetProp( data->node, name_str );
31
+ if (raw_ret == NULL)
32
+ return Qnil;
33
+
34
+ ret = rb_str_new2( (const char*)raw_ret );
35
+ xmlFree( raw_ret );
36
+
37
+ return ret;
38
+ }
39
+
40
+ VALUE fastxml_attrlist_indexer_set(VALUE self, VALUE attr_name, VALUE attr_value)
41
+ {
42
+ VALUE dv, attr_raw_str;
43
+ fxml_data_t *data;
44
+ xmlChar *val, *name_str;
45
+
46
+ dv = rb_iv_get( self, "@lxml_doc" );
47
+ Data_Get_Struct( dv, fxml_data_t, data );
48
+
49
+ attr_raw_str = rb_funcall( attr_name, s_to_s, 0 );
50
+ name_str = (xmlChar*)StringValuePtr( attr_raw_str );
51
+ val = (xmlChar*)StringValuePtr( attr_value );
52
+
53
+ xmlSetProp( data->node, name_str, val );
54
+
55
+ return attr_value;
56
+ }
57
+
58
+
59
+ /* }}} fastxml_attr_list
60
+ */
@@ -0,0 +1,11 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+
5
+ #ifndef fastxml_attrlist_h
6
+ #define fastxml_attrlist_h
7
+
8
+ RUBY_EXTERN VALUE fastxml_attrlist_initialize(VALUE self);
9
+ RUBY_EXTERN VALUE fastxml_attrlist_indexer(VALUE self, VALUE idx);
10
+ RUBY_EXTERN VALUE fastxml_attrlist_indexer_set(VALUE self, VALUE idx, VALUE val);
11
+ #endif
Binary file
data/ext/fastxml_doc.c ADDED
@@ -0,0 +1,190 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+ #include "fastxml.h"
5
+ #include "fastxml_node.h"
6
+ #include "fastxml_doc.h"
7
+ #include "fastxml_nodelist.h"
8
+
9
+
10
+ /* {{{ fastxml_doc
11
+ */
12
+ VALUE fastxml_doc_inspect(VALUE self)
13
+ {
14
+ VALUE *argv;
15
+ argv = ALLOCA_N( VALUE, 3 );
16
+ argv[0] = rb_str_new2( "#<%s:0x%x %s>" );
17
+ argv[1] = CLASS_OF( self );
18
+ argv[2] = rb_obj_id( self );
19
+ argv[3] = fastxml_doc_to_s( self );
20
+
21
+ return rb_f_sprintf( 4, argv );
22
+ }
23
+
24
+ VALUE fastxml_doc_children(VALUE self)
25
+ {
26
+ VALUE dv;
27
+ fxml_data_t *data;
28
+
29
+ dv = rb_iv_get( self, "@lxml_doc" );
30
+ Data_Get_Struct( dv, fxml_data_t, data );
31
+
32
+ if (data->doc->children == NULL)
33
+ return Qnil;
34
+
35
+ return fastxml_nodelist_to_obj( data->doc->children, -1 );
36
+ }
37
+
38
+ VALUE fastxml_doc_stylesheet(VALUE self)
39
+ {
40
+ return rb_iv_get( self, "@lxml_style" );
41
+ }
42
+
43
+ VALUE fastxml_doc_stylesheet_set(VALUE self, VALUE style)
44
+ {
45
+ VALUE dv, xslt_doc;
46
+ fxml_data_t *data;
47
+
48
+ xslt_doc = rb_class_new_instance(1, &style, rb_cFastXmlDoc );
49
+
50
+ dv = rb_iv_get( xslt_doc, "@lxml_doc" );
51
+ Data_Get_Struct( dv, fxml_data_t, data );
52
+ data->xslt = xsltParseStylesheetDoc( data->doc );
53
+ rb_iv_set( self, "@lxml_style", xslt_doc );
54
+
55
+ return Qnil;
56
+ }
57
+
58
+ VALUE fastxml_doc_transform(VALUE self, VALUE xform)
59
+ {
60
+ VALUE ret, dv, xform_dv, ret_str, ret_dv;
61
+ fxml_data_t *my_data, *xf_data, *ret_data;
62
+ xmlDocPtr ret_doc;
63
+
64
+ if (xform == Qnil)
65
+ return Qnil;
66
+
67
+ dv = rb_iv_get( self, "@lxml_doc" );
68
+ Data_Get_Struct( dv, fxml_data_t, my_data );
69
+ xform_dv = rb_iv_get( xform, "@lxml_doc" );
70
+ Data_Get_Struct( xform_dv, fxml_data_t, xf_data );
71
+
72
+ if (xf_data->xslt == NULL)
73
+ return Qnil;
74
+
75
+ ret_doc = (xmlDocPtr)xsltApplyStylesheet( xf_data->xslt, my_data->doc, NULL );
76
+ ret_str = rb_str_new2( "<shouldNeverBeSeen/>" );
77
+ ret = rb_class_new_instance( 1, &ret_str, rb_cFastXmlDoc );
78
+ ret_dv = rb_iv_get( ret, "@lxml_doc" );
79
+ Data_Get_Struct( ret_dv, fxml_data_t, ret_data );
80
+ xmlFree( ret_data->doc );
81
+ ret_data->doc = ret_doc;
82
+
83
+ return ret;
84
+ }
85
+
86
+ VALUE fastxml_doc_search(VALUE self, VALUE raw_xpath, VALUE blk)
87
+ {
88
+ return fastxml_xpath_search( self, raw_xpath, blk );
89
+ }
90
+
91
+ VALUE fastxml_doc_to_s(VALUE self)
92
+ {
93
+ VALUE ret, dv;
94
+ xmlChar *xs;
95
+ fxml_data_t *data;
96
+ int xs_len;
97
+
98
+ dv = rb_iv_get( self, "@lxml_doc" );
99
+ Data_Get_Struct( dv, fxml_data_t, data );
100
+
101
+ xmlDocDumpFormatMemory( data->doc, &xs, &xs_len, 0 );
102
+
103
+ ret = rb_str_new( (const char*)xs, xs_len );
104
+ xmlFree( xs );
105
+
106
+ return ret;
107
+ }
108
+
109
+ VALUE fastxml_doc_root(VALUE self)
110
+ {
111
+ VALUE dv;
112
+ fxml_data_t *data;
113
+ xmlNodePtr root;
114
+
115
+ dv = rb_iv_get( self, "@lxml_doc" );
116
+ Data_Get_Struct( dv, fxml_data_t, data );
117
+
118
+ root = xmlDocGetRootElement( data->doc );
119
+
120
+ return fastxml_raw_node_to_obj( root );
121
+ }
122
+
123
+ VALUE fastxml_doc_initialize(int argc, VALUE* argv, VALUE self)
124
+ {
125
+ VALUE data_s, dv, lines, xml_doc_str, opts, blk;
126
+ fxml_data_t *data;
127
+ int parser_opts = XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
128
+ short html_parser = 0;
129
+
130
+ if (rb_scan_args( argc, argv, "11&", &xml_doc_str, &opts, &blk ) == 0)
131
+ return Qnil; // error state
132
+
133
+ if (NIL_P(xml_doc_str)) {
134
+ rb_raise(rb_eArgError, "nil passed as xml document");
135
+ return Qnil;
136
+ }
137
+
138
+ if (opts != Qnil) {
139
+ if (rb_hash_aref(opts, rb_sValidateDtd) == Qtrue) {
140
+ parser_opts = parser_opts | XML_PARSE_DTDLOAD | XML_PARSE_DTDATTR | XML_PARSE_DTDVALID;
141
+ rb_iv_set( self, "@validate_dtd", Qtrue );
142
+ }
143
+
144
+ if (rb_hash_aref(opts, rb_sForgivingParse) == Qtrue) {
145
+ parser_opts = parser_opts | XML_PARSE_RECOVER;
146
+ rb_iv_set( self, "@forgiving", Qtrue );
147
+ }
148
+
149
+ if (rb_hash_aref(opts, rb_sHtmlParse) == Qtrue) {
150
+ html_parser = 1;
151
+ }
152
+ }
153
+
154
+ if (rb_respond_to( xml_doc_str, s_readlines )) {
155
+ lines = rb_funcall( xml_doc_str, s_readlines, 0 );
156
+ data_s = rb_funcall( lines, s_to_s, 0 );
157
+ }
158
+ else
159
+ data_s = rb_obj_as_string( xml_doc_str );
160
+
161
+ rb_iv_set( self, "@raw_data", data_s );
162
+
163
+ data = ALLOC(fxml_data_t);
164
+ memset( data, (int)NULL, sizeof(fxml_data_t) );
165
+
166
+ if (html_parser == 0)
167
+ data->doc = xmlReadMemory( RSTRING(data_s)->ptr, RSTRING(data_s)->len,
168
+ "noname.xml", NULL, parser_opts );
169
+ else
170
+ data->doc = htmlReadMemory( RSTRING(data_s)->ptr, RSTRING(data_s)->len,
171
+ "noname.html", NULL, HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING );
172
+
173
+ // if we're mallformed we might want to use xmlRecoverMemcory(char*, int)
174
+ if (data->doc == NULL) {
175
+ rb_raise( rb_eRuntimeError, "Failed to parse document" );
176
+ return Qnil;
177
+ }
178
+
179
+ dv = Data_Wrap_Struct( rb_cObject, fastxml_data_mark, fastxml_data_free, data );
180
+ rb_iv_set(self, "@lxml_doc", dv );
181
+
182
+ if (blk != Qnil)
183
+ rb_yield( self );
184
+
185
+ return self;
186
+ }
187
+
188
+
189
+ /* }}} fastxml_doc
190
+ */
data/ext/fastxml_doc.h ADDED
@@ -0,0 +1,16 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+
5
+ #ifndef fastxml_doc_h
6
+ #define fastxml_doc_h
7
+ RUBY_EXTERN VALUE fastxml_doc_initialize(int argc, VALUE* argv, VALUE self);
8
+ RUBY_EXTERN VALUE fastxml_doc_search(VALUE self, VALUE raw_xpath, VALUE blk);
9
+ RUBY_EXTERN VALUE fastxml_doc_to_s(VALUE self);
10
+ RUBY_EXTERN VALUE fastxml_doc_root(VALUE self);
11
+ RUBY_EXTERN VALUE fastxml_doc_transform(VALUE self, VALUE xform);
12
+ RUBY_EXTERN VALUE fastxml_doc_stylesheet(VALUE self);
13
+ RUBY_EXTERN VALUE fastxml_doc_stylesheet_set(VALUE self, VALUE style);
14
+ RUBY_EXTERN VALUE fastxml_doc_children(VALUE self);
15
+ RUBY_EXTERN VALUE fastxml_doc_inspect(VALUE self);
16
+ #endif
data/ext/fastxml_doc.o ADDED
Binary file
@@ -0,0 +1,240 @@
1
+ /*
2
+ * $Id$
3
+ */
4
+
5
+ #include "fastxml.h"
6
+ #include "fastxml_node.h"
7
+ #include "fastxml_doc.h"
8
+ #include "fastxml_nodelist.h"
9
+
10
+ /* {{{ fastxml_node
11
+ */
12
+
13
+ VALUE fastxml_node_inspect(VALUE self)
14
+ {
15
+ VALUE dv;
16
+ VALUE *argv;
17
+ fxml_data_t *data;
18
+
19
+ dv = rb_iv_get( self, "@lxml_doc" );
20
+ Data_Get_Struct( dv, fxml_data_t, data );
21
+
22
+ argv = ALLOCA_N( VALUE, 4 );
23
+ argv[0] = rb_str_new2( "#<%s:0x%x %s>" );
24
+ argv[1] = CLASS_OF( self );
25
+ argv[2] = rb_obj_id( self );
26
+ argv[3] = rb_str_new2( (char*) data->node->name );
27
+ return rb_f_sprintf( 4, argv );
28
+ }
29
+
30
+ VALUE fastxml_node_initialize(VALUE self)
31
+ {
32
+ return self;
33
+ }
34
+
35
+ VALUE fastxml_node_innerxml(VALUE self)
36
+ {
37
+ VALUE dv, ret;
38
+ fxml_data_t *data;
39
+ xmlBufferPtr buf = xmlBufferCreate();
40
+
41
+ dv = rb_iv_get( self, "@lxml_doc" );
42
+ Data_Get_Struct( dv, fxml_data_t, data );
43
+
44
+ xmlNodeDump( buf, data->doc, data->node, 0, 0 );
45
+ ret = rb_str_new2( (char*)xmlBufferContent( buf ) );
46
+ xmlBufferFree( buf );
47
+
48
+ return ret;
49
+ }
50
+
51
+ VALUE fastxml_node_next(VALUE self)
52
+ {
53
+ VALUE dv, next;
54
+ fxml_data_t *data;
55
+
56
+ dv = rb_iv_get( self, "@lxml_doc" );
57
+ Data_Get_Struct( dv, fxml_data_t, data );
58
+
59
+ if (data->node == NULL || (data->node != NULL && data->node->next == NULL))
60
+ return Qnil;
61
+
62
+ next = rb_iv_get( self, "@next" );
63
+ if (next == Qnil) {
64
+ next = fastxml_raw_node_to_obj( data->node->next );
65
+ rb_iv_set( self, "@next", next );
66
+ }
67
+
68
+ return next;
69
+ }
70
+
71
+ VALUE fastxml_node_prev(VALUE self)
72
+ {
73
+ VALUE dv, prev;
74
+ fxml_data_t *data;
75
+
76
+ dv = rb_iv_get( self, "@lxml_doc" );
77
+ Data_Get_Struct( dv, fxml_data_t, data );
78
+
79
+ if (data->node == NULL || (data->node != NULL && data->node->prev == NULL))
80
+ return Qnil;
81
+
82
+ prev = rb_iv_get( self, "@prev" );
83
+ if (prev == Qnil) {
84
+ prev = fastxml_raw_node_to_obj( data->node->prev );
85
+ rb_iv_set( self, "@prev", prev );
86
+ }
87
+
88
+ return prev;
89
+ }
90
+
91
+ VALUE fastxml_node_parent(VALUE self)
92
+ {
93
+ VALUE dv;
94
+ fxml_data_t *data;
95
+
96
+ dv = rb_iv_get( self, "@lxml_doc" );
97
+ Data_Get_Struct( dv, fxml_data_t, data );
98
+
99
+ if (data->node == NULL || (data->node != NULL && data->node->parent == NULL))
100
+ return Qnil;
101
+
102
+ return fastxml_raw_node_to_obj( data->node->parent );
103
+ }
104
+
105
+ VALUE fastxml_node_children(VALUE self)
106
+ {
107
+ VALUE dv;
108
+ fxml_data_t *data;
109
+
110
+ dv = rb_iv_get( self, "@lxml_doc" );
111
+ Data_Get_Struct( dv, fxml_data_t, data );
112
+
113
+ if (data->node == NULL || (data->node != NULL && data->node->children == NULL))
114
+ return Qnil;
115
+
116
+ return fastxml_nodelist_to_obj( data->node->children, -1 );
117
+ }
118
+
119
+ VALUE fastxml_node_name(VALUE self)
120
+ {
121
+ VALUE ret, dv;
122
+ fxml_data_t *data;
123
+
124
+ dv = rb_iv_get( self, "@lxml_doc" );
125
+ Data_Get_Struct( dv, fxml_data_t, data );
126
+
127
+ if (data->node == NULL || (data->node != NULL && data->node->name == NULL))
128
+ return Qnil;
129
+
130
+ ret = rb_str_new2( (const char*)data->node->name );
131
+
132
+ return ret;
133
+ }
134
+
135
+
136
+ VALUE fastxml_node_attr(VALUE self)
137
+ {
138
+ VALUE self_dv, ret;
139
+ fxml_data_t *data;
140
+ xmlChar *raw_ret;
141
+
142
+ ret = rb_iv_get( self, "@attrs" );
143
+ if (ret == Qnil) {
144
+ self_dv = rb_iv_get( self, "@lxml_doc" );
145
+ Data_Get_Struct( self_dv, fxml_data_t, data );
146
+ ret = rb_class_new_instance( 0, 0, rb_cFastXmlAttrList );
147
+
148
+ rb_iv_set( ret, "@lxml_doc", self_dv );
149
+ rb_iv_set( self, "@attrs", ret );
150
+ }
151
+
152
+ return ret;
153
+ }
154
+
155
+
156
+ VALUE fastxml_node_xpath(VALUE self)
157
+ {
158
+ VALUE ret, dv;
159
+ fxml_data_t *data;
160
+ xmlChar *raw_ret;
161
+
162
+ dv = rb_iv_get( self, "@lxml_doc" );
163
+ Data_Get_Struct( dv, fxml_data_t, data );
164
+
165
+ raw_ret = xmlGetNodePath( data->node );
166
+ if (raw_ret == NULL)
167
+ return Qnil;
168
+
169
+ ret = rb_str_new2( (const char*)raw_ret );
170
+ xmlFree( raw_ret );
171
+
172
+ return ret;
173
+ }
174
+
175
+ VALUE fastxml_node_value_set(VALUE self, VALUE new_val)
176
+ {
177
+ VALUE dv, val_s;
178
+ fxml_data_t *data;
179
+ xmlChar *ents, *spec;
180
+
181
+
182
+ val_s = rb_obj_as_string( new_val );
183
+ dv = rb_iv_get( self, "@lxml_doc" );
184
+ Data_Get_Struct( dv, fxml_data_t, data );
185
+
186
+ ents = xmlEncodeEntitiesReentrant( data->doc, (const xmlChar*)StringValuePtr(val_s) );
187
+ spec = xmlEncodeSpecialChars( data->doc, ents );
188
+
189
+ xmlNodeSetContent( data->node, spec );
190
+ xmlFree( ents );
191
+
192
+ return new_val;
193
+ }
194
+
195
+ VALUE fastxml_node_value(VALUE self)
196
+ {
197
+ VALUE ret, dv;
198
+ fxml_data_t *data;
199
+ xmlChar *cont;
200
+
201
+ dv = rb_iv_get( self, "@lxml_doc" );
202
+ Data_Get_Struct( dv, fxml_data_t, data );
203
+
204
+ cont = xmlNodeGetContent( data->node );
205
+
206
+ if (cont == NULL)
207
+ return Qnil;
208
+
209
+ ret = rb_str_new2( (const char*)cont );
210
+
211
+ return ret;
212
+ }
213
+
214
+ VALUE fastxml_node_to_s(VALUE self)
215
+ {
216
+ VALUE ret, dv;
217
+ fxml_data_t *data;
218
+ xmlBufferPtr buf;
219
+
220
+ dv = rb_iv_get( self, "@lxml_doc" );
221
+ Data_Get_Struct( dv, fxml_data_t, data );
222
+
223
+ buf = xmlBufferCreate();
224
+ ret = Qnil;
225
+
226
+ if (xmlNodeDump(buf, data->doc, data->node, 0, 0) != -1)
227
+ ret = rb_str_new( (const char*)buf->content, buf->use );
228
+
229
+ xmlBufferFree( buf );
230
+ return ret;
231
+ }
232
+
233
+ VALUE fastxml_node_search(VALUE self, VALUE raw_xpath, VALUE blk)
234
+ {
235
+ return fastxml_xpath_search( self, raw_xpath, blk );
236
+ }
237
+
238
+
239
+ /* }}} fastxml_node
240
+ */