tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/pprint.h ADDED
@@ -0,0 +1,93 @@
1
+ #ifndef __PPRINT_H__
2
+ #define __PPRINT_H__
3
+
4
+ /* pprint.h -- pretty print parse tree
5
+
6
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info:
10
+ $Author: arnaud02 $
11
+ $Date: 2007/02/11 09:45:08 $
12
+ $Revision: 1.9 $
13
+
14
+ */
15
+
16
+ #include "forward.h"
17
+
18
+ /*
19
+ Block-level and unknown elements are printed on
20
+ new lines and their contents indented 2 spaces
21
+
22
+ Inline elements are printed inline.
23
+
24
+ Inline content is wrapped on spaces (except in
25
+ attribute values or preformatted text, after
26
+ start tags and before end tags)
27
+ */
28
+
29
+ #define NORMAL 0u /* print mode: no mode bits set; the non-zero modes below are distinct single bits, presumably OR-ed into the `mode` argument of PPrintTree -- confirm in pprint.c */
30
+ #define PREFORMATTED 1u /* mode bit 0x01 */
31
+ #define COMMENT 2u /* mode bit 0x02 */
32
+ #define ATTRIBVALUE 4u /* mode bit 0x04 */
33
+ #define NOWRAP 8u /* mode bit 0x08 */
34
+ #define CDATA 16u /* mode bit 0x10 */
35
+
36
+
37
+ /* The pretty printer keeps at most two lines of text in the
38
+ ** buffer before flushing output. We need to capture the
39
+ ** indent state (indent level) at the _beginning_ of _each_
40
+ ** line, not the end of just the second line.
41
+ **
42
+ ** We must also keep track of the "In Attribute" and "In String"
43
+ ** states at the _end_ of each line.
44
+ */
45
+
46
+ typedef struct _TidyIndent /* indent state recorded for one buffered output line */
47
+ {
48
+ int spaces; /* presumably the indent level at the beginning of the line -- confirm in pprint.c */
49
+ int attrValStart; /* presumably the "In Attribute" state at the end of the line -- confirm in pprint.c */
50
+ int attrStringStart; /* presumably the "In String" state at the end of the line -- confirm in pprint.c */
51
+ } TidyIndent;
52
+
53
+ typedef struct _TidyPrintImpl /* pretty-printer state: line buffer plus per-line indent records */
54
+ {
55
+ TidyAllocator *allocator; /* Allocator */
56
+
57
+ uint *linebuf; /* line buffer, stored as uint elements */
58
+ uint lbufsize; /* presumably the allocated capacity of linebuf -- confirm in pprint.c */
59
+ uint linelen; /* presumably the number of elements currently used in linebuf -- confirm */
60
+ uint wraphere; /* presumably the position in linebuf where the line may be wrapped -- confirm */
61
+
62
+ uint ixInd; /* presumably the index of the active entry in indent[] -- confirm */
63
+ TidyIndent indent[2]; /* Two lines worth of indent state */
64
+ } TidyPrintImpl;
65
+
66
+
67
+ #if 0 && SUPPORT_ASIAN_ENCODINGS
68
+ /* #431953 - start RJ Wraplen adjusted for smooth international ride */
69
+ uint CWrapLen( TidyDocImpl* doc, uint ind );
70
+ #endif
71
+
72
+ void TY_(InitPrintBuf)( TidyDocImpl* doc );
73
+ void TY_(FreePrintBuf)( TidyDocImpl* doc );
74
+
75
+ void TY_(PFlushLine)( TidyDocImpl* doc, uint indent );
76
+
77
+
78
+ /* print just the content of the body element.
79
+ ** useful when you want to reuse material from
80
+ ** other documents.
81
+ **
82
+ ** -- Sebastiano Vigna <vigna@dsi.unimi.it>
83
+ */
84
+
85
+ void TY_(PrintBody)( TidyDocImpl* doc ); /* you can print an entire document */
86
+ /* node as body using PPrintTree() */
87
+
88
+ void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
89
+
90
+ void TY_(PPrintXMLTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
91
+
92
+
93
+ #endif /* __PPRINT_H__ */
@@ -0,0 +1,195 @@
1
+ /*
2
+ ruby-tidy.c - Ruby driver for HTML TidyLib
3
+ */
4
+ #include <ruby.h>
5
+ #include "tidy.h"
6
+ #include "buffio.h"
7
+
8
+ int TIDY_CALL rb_tidyGetByte( TidyInputSource* source )
9
+ {
10
+ VALUE data = (VALUE)source->sourceData;
11
+ VALUE value = rb_funcall(data, rb_intern("getc"), 0, NULL);
12
+ return NUM2INT(value);
13
+ }
14
+
15
+ void TIDY_CALL rb_tidyUngetByte( TidyInputSource* source, byte byteValue )
16
+ {
17
+ VALUE data = (VALUE)source->sourceData;
18
+ VALUE value = rb_funcall(data, rb_intern("ungetc"), 1, INT2NUM(byteValue));
19
+ }
20
+
21
+ Bool TIDY_CALL rb_tidyIsEOF( TidyInputSource* source )
22
+ {
23
+ VALUE data = (VALUE)source->sourceData;
24
+ VALUE value = rb_funcall(data, rb_intern("eof"), 0, NULL);
25
+ return value == Qfalse ? no : yes;
26
+ }
27
+
28
+ static VALUE cTidy; /* Ruby class object for Tidy; assigned in Init_tidy() */
29
+
30
/*
 * Free callback installed by Data_Wrap_Struct in rb_tidy_new: hands the
 * wrapped tidy document back to TidyLib when the Ruby object is collected.
 *
 * ptr - the wrapped pointer, i.e. the TidyDoc created by tidyCreate().
 */
static void rb_tidy_free(void *ptr)
{
  void *wrapped_doc = ptr;

  tidyRelease(wrapped_doc);
}
35
+
36
+ /* create a new tidy doc */
37
+ /* TODO, observe :show_warnings=>true in hash */
38
+ static VALUE rb_tidy_new(VALUE class, VALUE hash)
39
+ {
40
+ VALUE argv[1];
41
+ TidyDoc tdoc = tidyCreate();
42
+ VALUE tdata = Data_Wrap_Struct(class, 0, rb_tidy_free, (struct _TidyDoc *)tdoc);
43
+ argv[0] = hash;
44
+
45
+ rb_obj_call_init(tdata, 0, NULL);
46
+ return tdata;
47
+ }
48
+
49
+ /* parse the given input and return the tidy errors and output */
50
+ static VALUE rb_tidy_parse(VALUE self, VALUE input)
51
+ {
52
+ VALUE array;
53
+ VALUE access;
54
+ VALUE errors;
55
+
56
+ TidyDoc tdoc;
57
+ TidyBuffer output;
58
+ TidyBuffer errbuf;
59
+ int status = 0;
60
+
61
+ int contentErrors = 0;
62
+ int contentWarnings = 0;
63
+ int accessWarnings = 0;
64
+
65
+ /* See platform.h, opaque_type for typedef convention */
66
+ Data_Get_Struct(self, struct _TidyDoc, tdoc);
67
+
68
+ tidyBufInit( &output );
69
+ tidyBufInit( &errbuf );
70
+
71
+ array = rb_ary_new();
72
+
73
+ status = tidySetErrorBuffer( tdoc, &errbuf );
74
+
75
+ access = rb_iv_get(self, "@access");
76
+ tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, NUM2UINT(access));
77
+
78
+ if (status >= 0) {
79
+
80
+ int is_input_source = 0;
81
+
82
+ is_input_source =
83
+ rb_respond_to(input, rb_intern("eof")) == Qtrue &&
84
+ rb_respond_to(input, rb_intern("getc")) == Qtrue &&
85
+ rb_respond_to(input, rb_intern("ungetc")) == Qtrue;
86
+
87
+ if (is_input_source != 0) {
88
+ TidyInputSource source;
89
+
90
+ tidyInitSource(&source, (void *)&input,
91
+ (TidyGetByteFunc)rb_tidyGetByte,
92
+ (TidyUngetByteFunc)rb_tidyUngetByte,
93
+ (TidyEOFFunc)rb_tidyIsEOF);
94
+
95
+ status = tidyParseSource(tdoc, &source);
96
+ } else {
97
+ status = tidyParseString(tdoc, StringValuePtr(input));
98
+ }
99
+ }
100
+
101
+ if (status >= 0)
102
+ status = tidyCleanAndRepair( tdoc );
103
+ if (status >= 0)
104
+ status = tidyRunDiagnostics( tdoc );
105
+ if (status >= 0)
106
+ tidyErrorSummary( tdoc );
107
+ if (status >= 0)
108
+ tidyGeneralInfo( tdoc );
109
+
110
+ if (status >= 0)
111
+ status = tidySaveBuffer( tdoc, &output );
112
+
113
+ contentErrors += tidyErrorCount( tdoc );
114
+ contentWarnings += tidyWarningCount( tdoc );
115
+ accessWarnings += tidyAccessWarningCount( tdoc );
116
+
117
+ if (contentErrors > 0 || contentWarnings > 0) {
118
+ errors = rb_str_new2(errbuf.bp);
119
+ } else {
120
+ errors = rb_ary_new2("");
121
+ }
122
+
123
+ rb_iv_set(self, "@errors", errors);
124
+
125
+ rb_ary_store(array, 0, errors);
126
+ rb_ary_store(array, 1, rb_str_new2(output.bp));
127
+
128
+ tidyBufFree( &output );
129
+ tidyBufFree( &errbuf );
130
+
131
+ return array;
132
+ }
133
+
134
+ static VALUE rb_tidy_init(VALUE self)
135
+ {
136
+ VALUE access = INT2NUM(4);
137
+ VALUE errors = rb_ary_new();
138
+
139
+ rb_iv_set(self, "@access", access);
140
+ rb_iv_set(self, "@errors", errors);
141
+
142
+ return self;
143
+ }
144
+
145
+ static VALUE rb_tidy_open(VALUE class, VALUE hash)
146
+ {
147
+ VALUE tidy = rb_tidy_new(class, hash);
148
+
149
+ if (rb_block_given_p()) {
150
+ rb_yield(tidy);
151
+ }
152
+
153
+ return tidy;
154
+ }
155
+
156
+ static VALUE rb_tidy_clean(VALUE self, VALUE input)
157
+ {
158
+ VALUE array;
159
+
160
+ array = rb_tidy_parse(self, input);
161
+
162
+ return rb_ary_entry(array, 1);
163
+ }
164
+
165
+ static VALUE rb_tidy_path_get(VALUE self)
166
+ {
167
+ VALUE path;
168
+ path = rb_cv_get(self, "@@path");
169
+ return path;
170
+ }
171
+
172
+ static VALUE rb_tidy_path_set(VALUE self, VALUE path)
173
+ {
174
+ rb_cv_set(self, "@@path", path);
175
+ return Qnil;
176
+ }
177
+
178
+ void Init_tidy()
179
+ {
180
+ cTidy = rb_define_class("Tidy", rb_cObject);
181
+
182
+ rb_define_class_variable(cTidy, "@@path", rb_str_new2("tidy-is-built-in"));
183
+
184
+ rb_define_singleton_method(cTidy, "new", rb_tidy_new, 0);
185
+ rb_define_singleton_method(cTidy, "open", rb_tidy_open, 1);
186
+ rb_define_singleton_method(cTidy, "path", rb_tidy_path_get, 0);
187
+ rb_define_singleton_method(cTidy, "path=", rb_tidy_path_set, 1);
188
+
189
+ rb_define_method(cTidy, "parse", rb_tidy_parse, 1);
190
+ rb_define_method(cTidy, "initialize", rb_tidy_init, 0);
191
+ rb_define_method(cTidy, "clean", rb_tidy_clean, 1);
192
+
193
+ rb_define_attr(cTidy, "access", 1, 1);
194
+ rb_define_attr(cTidy, "errors", 1, 0);
195
+ }