tidy-ext 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/pprint.h
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef __PPRINT_H__
|
2
|
+
#define __PPRINT_H__
|
3
|
+
|
4
|
+
/* pprint.h -- pretty print parse tree
|
5
|
+
|
6
|
+
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info:
|
10
|
+
$Author: arnaud02 $
|
11
|
+
$Date: 2007/02/11 09:45:08 $
|
12
|
+
$Revision: 1.9 $
|
13
|
+
|
14
|
+
*/
|
15
|
+
|
16
|
+
#include "forward.h"
|
17
|
+
|
18
|
+
/*
|
19
|
+
Block-level and unknown elements are printed on
|
20
|
+
new lines and their contents indented 2 spaces
|
21
|
+
|
22
|
+
Inline elements are printed inline.
|
23
|
+
|
24
|
+
Inline content is wrapped on spaces (except in
|
25
|
+
attribute values or preformatted text), after
|
26
|
+
start tags and before end tags
|
27
|
+
*/
|
28
|
+
|
29
|
+
#define NORMAL 0u
|
30
|
+
#define PREFORMATTED 1u
|
31
|
+
#define COMMENT 2u
|
32
|
+
#define ATTRIBVALUE 4u
|
33
|
+
#define NOWRAP 8u
|
34
|
+
#define CDATA 16u
|
35
|
+
|
36
|
+
|
37
|
+
/* The pretty printer keeps at most two lines of text in the
|
38
|
+
** buffer before flushing output. We need to capture the
|
39
|
+
** indent state (indent level) at the _beginning_ of _each_
|
40
|
+
** line, not the end of just the second line.
|
41
|
+
**
|
42
|
+
** We must also keep track of the "In Attribute" and "In String"
|
43
|
+
** states at the _end_ of each line.
|
44
|
+
*/
|
45
|
+
|
46
|
+
typedef struct _TidyIndent /* indent state recorded for one buffered output line */
|
47
|
+
{
|
48
|
+
int spaces; /* presumably the indent level captured for the line -- confirm in pprint.c */
|
49
|
+
int attrValStart; /* NOTE(review): looks like the "In Attribute" marker; encoding not visible here */
|
50
|
+
int attrStringStart; /* NOTE(review): looks like the "In String" marker; encoding not visible here */
|
51
|
+
} TidyIndent;
|
52
|
+
|
53
|
+
typedef struct _TidyPrintImpl /* pretty-printer state: a line buffer plus per-line indent records */
|
54
|
+
{
|
55
|
+
TidyAllocator *allocator; /* Allocator */
|
56
|
+
|
57
|
+
uint *linebuf; /* buffered output text; element meaning not visible here -- TODO confirm */
|
58
|
+
uint lbufsize; /* presumably the allocated capacity of linebuf -- TODO confirm */
|
59
|
+
uint linelen; /* presumably the number of linebuf entries in use -- TODO confirm */
|
60
|
+
uint wraphere; /* presumably the candidate wrap position in linebuf -- TODO confirm */
|
61
|
+
|
62
|
+
uint ixInd; /* presumably selects the active entry of indent[] -- TODO confirm */
|
63
|
+
TidyIndent indent[2]; /* Two lines worth of indent state */
|
64
|
+
} TidyPrintImpl;
|
65
|
+
|
66
|
+
|
67
|
+
#if 0 && SUPPORT_ASIAN_ENCODINGS
|
68
|
+
/* #431953 - start RJ Wraplen adjusted for smooth international ride */
|
69
|
+
uint CWrapLen( TidyDocImpl* doc, uint ind );
|
70
|
+
#endif
|
71
|
+
|
72
|
+
void TY_(InitPrintBuf)( TidyDocImpl* doc );
|
73
|
+
void TY_(FreePrintBuf)( TidyDocImpl* doc );
|
74
|
+
|
75
|
+
void TY_(PFlushLine)( TidyDocImpl* doc, uint indent );
|
76
|
+
|
77
|
+
|
78
|
+
/* print just the content of the body element.
|
79
|
+
** useful when you want to reuse material from
|
80
|
+
** other documents.
|
81
|
+
**
|
82
|
+
** -- Sebastiano Vigna <vigna@dsi.unimi.it>
|
83
|
+
*/
|
84
|
+
|
85
|
+
void TY_(PrintBody)( TidyDocImpl* doc ); /* you can print an entire document */
|
86
|
+
/* node as body using PPrintTree() */
|
87
|
+
|
88
|
+
void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
|
89
|
+
|
90
|
+
void TY_(PPrintXMLTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
|
91
|
+
|
92
|
+
|
93
|
+
#endif /* __PPRINT_H__ */
|
@@ -0,0 +1,195 @@
|
|
1
|
+
/*
|
2
|
+
ruby-tidy.c - Ruby driver for HTML TidyLib
|
3
|
+
*/
|
4
|
+
#include <ruby.h>
|
5
|
+
#include "tidy.h"
|
6
|
+
#include "buffio.h"
|
7
|
+
|
8
|
+
/*
 * Tidy input callback: fetch the next byte from a Ruby IO-like object.
 *
 * source->sourceData is read as the Ruby VALUE to pull from.  The byte is
 * requested with `getbyte` when the object responds to it (Integer or nil),
 * falling back to `getc` otherwise; on Ruby >= 1.9 `getc` returns a
 * one-character String, which NUM2INT cannot convert.
 *
 * Returns the byte value, or EndOfStream when the Ruby call returns nil, so
 * an exhausted stream no longer raises a TypeError from inside the parser.
 */
int TIDY_CALL rb_tidyGetByte( TidyInputSource* source )
{
  VALUE io = (VALUE)source->sourceData;
  ID reader = rb_respond_to(io, rb_intern("getbyte"))
                ? rb_intern("getbyte")
                : rb_intern("getc");
  VALUE fetched = rb_funcall(io, reader, 0);

  if (NIL_P(fetched))
    return (int)EndOfStream;

  return NUM2INT(fetched);
}
|
14
|
+
|
15
|
+
/*
 * Tidy input callback: push one byte back onto a Ruby IO-like object.
 *
 * source->sourceData is read as the Ruby VALUE to push onto.  `ungetbyte`
 * is used when the object responds to it, because `ungetc` with an Integer
 * is character-oriented on Ruby >= 1.9; otherwise `ungetc` is called as
 * before.  The Ruby return value is ignored.
 */
void TIDY_CALL rb_tidyUngetByte( TidyInputSource* source, byte byteValue )
{
  VALUE io = (VALUE)source->sourceData;
  ID pusher = rb_respond_to(io, rb_intern("ungetbyte"))
                ? rb_intern("ungetbyte")
                : rb_intern("ungetc");

  rb_funcall(io, pusher, 1, INT2NUM(byteValue));
}
|
20
|
+
|
21
|
+
/*
 * Tidy input callback: report whether the Ruby IO-like object is exhausted.
 *
 * Calls `eof` on the VALUE read from source->sourceData.  Only an explicit
 * Ruby `false` counts as "more input"; any other result (including nil) is
 * reported as end of input.
 */
Bool TIDY_CALL rb_tidyIsEOF( TidyInputSource* source )
{
  VALUE io = (VALUE)source->sourceData;
  /* argc is 0, so no trailing argument is passed to rb_funcall */
  VALUE at_end = rb_funcall(io, rb_intern("eof"), 0);

  if (at_end == Qfalse)
    return no;

  return yes;
}
|
27
|
+
|
28
|
+
static VALUE cTidy;
|
29
|
+
|
30
|
+
/* release tidy doc memory */
|
31
|
+
static void rb_tidy_free(void *ptr)
|
32
|
+
{
|
33
|
+
tidyRelease(ptr);
|
34
|
+
}
|
35
|
+
|
36
|
+
/* create a new tidy doc */
|
37
|
+
/* TODO, observe :show_warnings=>true in hash */
|
38
|
+
static VALUE rb_tidy_new(VALUE class, VALUE hash)
|
39
|
+
{
|
40
|
+
VALUE argv[1];
|
41
|
+
TidyDoc tdoc = tidyCreate();
|
42
|
+
VALUE tdata = Data_Wrap_Struct(class, 0, rb_tidy_free, (struct _TidyDoc *)tdoc);
|
43
|
+
argv[0] = hash;
|
44
|
+
|
45
|
+
rb_obj_call_init(tdata, 0, NULL);
|
46
|
+
return tdata;
|
47
|
+
}
|
48
|
+
|
49
|
+
/* parse the given input and return the tidy errors and output */
|
50
|
+
static VALUE rb_tidy_parse(VALUE self, VALUE input)
|
51
|
+
{
|
52
|
+
VALUE array;
|
53
|
+
VALUE access;
|
54
|
+
VALUE errors;
|
55
|
+
|
56
|
+
TidyDoc tdoc;
|
57
|
+
TidyBuffer output;
|
58
|
+
TidyBuffer errbuf;
|
59
|
+
int status = 0;
|
60
|
+
|
61
|
+
int contentErrors = 0;
|
62
|
+
int contentWarnings = 0;
|
63
|
+
int accessWarnings = 0;
|
64
|
+
|
65
|
+
/* See platform.h, opaque_type for typedef convention */
|
66
|
+
Data_Get_Struct(self, struct _TidyDoc, tdoc);
|
67
|
+
|
68
|
+
tidyBufInit( &output );
|
69
|
+
tidyBufInit( &errbuf );
|
70
|
+
|
71
|
+
array = rb_ary_new();
|
72
|
+
|
73
|
+
status = tidySetErrorBuffer( tdoc, &errbuf );
|
74
|
+
|
75
|
+
access = rb_iv_get(self, "@access");
|
76
|
+
tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, NUM2UINT(access));
|
77
|
+
|
78
|
+
if (status >= 0) {
|
79
|
+
|
80
|
+
int is_input_source = 0;
|
81
|
+
|
82
|
+
is_input_source =
|
83
|
+
rb_respond_to(input, rb_intern("eof")) == Qtrue &&
|
84
|
+
rb_respond_to(input, rb_intern("getc")) == Qtrue &&
|
85
|
+
rb_respond_to(input, rb_intern("ungetc")) == Qtrue;
|
86
|
+
|
87
|
+
if (is_input_source != 0) {
|
88
|
+
TidyInputSource source;
|
89
|
+
|
90
|
+
tidyInitSource(&source, (void *)&input,
|
91
|
+
(TidyGetByteFunc)rb_tidyGetByte,
|
92
|
+
(TidyUngetByteFunc)rb_tidyUngetByte,
|
93
|
+
(TidyEOFFunc)rb_tidyIsEOF);
|
94
|
+
|
95
|
+
status = tidyParseSource(tdoc, &source);
|
96
|
+
} else {
|
97
|
+
status = tidyParseString(tdoc, StringValuePtr(input));
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
101
|
+
if (status >= 0)
|
102
|
+
status = tidyCleanAndRepair( tdoc );
|
103
|
+
if (status >= 0)
|
104
|
+
status = tidyRunDiagnostics( tdoc );
|
105
|
+
if (status >= 0)
|
106
|
+
tidyErrorSummary( tdoc );
|
107
|
+
if (status >= 0)
|
108
|
+
tidyGeneralInfo( tdoc );
|
109
|
+
|
110
|
+
if (status >= 0)
|
111
|
+
status = tidySaveBuffer( tdoc, &output );
|
112
|
+
|
113
|
+
contentErrors += tidyErrorCount( tdoc );
|
114
|
+
contentWarnings += tidyWarningCount( tdoc );
|
115
|
+
accessWarnings += tidyAccessWarningCount( tdoc );
|
116
|
+
|
117
|
+
if (contentErrors > 0 || contentWarnings > 0) {
|
118
|
+
errors = rb_str_new2(errbuf.bp);
|
119
|
+
} else {
|
120
|
+
errors = rb_ary_new2("");
|
121
|
+
}
|
122
|
+
|
123
|
+
rb_iv_set(self, "@errors", errors);
|
124
|
+
|
125
|
+
rb_ary_store(array, 0, errors);
|
126
|
+
rb_ary_store(array, 1, rb_str_new2(output.bp));
|
127
|
+
|
128
|
+
tidyBufFree( &output );
|
129
|
+
tidyBufFree( &errbuf );
|
130
|
+
|
131
|
+
return array;
|
132
|
+
}
|
133
|
+
|
134
|
+
/*
 * Tidy#initialize: seed the instance variables the other methods read.
 * @access starts at 4 and @errors starts as an empty Array.
 */
static VALUE rb_tidy_init(VALUE self)
{
  rb_iv_set(self, "@access", INT2NUM(4));
  rb_iv_set(self, "@errors", rb_ary_new());

  return self;
}
|
144
|
+
|
145
|
+
/*
 * Tidy.open(hash): build an instance through rb_tidy_new, yield it to the
 * caller's block when one was given, and return the instance either way.
 */
static VALUE rb_tidy_open(VALUE class, VALUE hash)
{
  VALUE instance = rb_tidy_new(class, hash);

  if (!rb_block_given_p())
    return instance;

  rb_yield(instance);
  return instance;
}
|
155
|
+
|
156
|
+
/*
 * Tidy#clean(input): run rb_tidy_parse and return only element 1 of its
 * result array (the output slot).
 */
static VALUE rb_tidy_clean(VALUE self, VALUE input)
{
  VALUE parsed = rb_tidy_parse(self, input);

  return rb_ary_entry(parsed, 1);
}
|
164
|
+
|
165
|
+
/* Tidy.path: return the current value of the @@path class variable. */
static VALUE rb_tidy_path_get(VALUE self)
{
  return rb_cv_get(self, "@@path");
}
|
171
|
+
|
172
|
+
/* Tidy.path=: store `path` in the @@path class variable; the C function yields nil. */
static VALUE rb_tidy_path_set(VALUE self, VALUE path)
{
  rb_cv_set(self, "@@path", path);

  return Qnil;
}
|
177
|
+
|
178
|
+
void Init_tidy()
|
179
|
+
{
|
180
|
+
cTidy = rb_define_class("Tidy", rb_cObject);
|
181
|
+
|
182
|
+
rb_define_class_variable(cTidy, "@@path", rb_str_new2("tidy-is-built-in"));
|
183
|
+
|
184
|
+
rb_define_singleton_method(cTidy, "new", rb_tidy_new, 0);
|
185
|
+
rb_define_singleton_method(cTidy, "open", rb_tidy_open, 1);
|
186
|
+
rb_define_singleton_method(cTidy, "path", rb_tidy_path_get, 0);
|
187
|
+
rb_define_singleton_method(cTidy, "path=", rb_tidy_path_set, 1);
|
188
|
+
|
189
|
+
rb_define_method(cTidy, "parse", rb_tidy_parse, 1);
|
190
|
+
rb_define_method(cTidy, "initialize", rb_tidy_init, 0);
|
191
|
+
rb_define_method(cTidy, "clean", rb_tidy_clean, 1);
|
192
|
+
|
193
|
+
rb_define_attr(cTidy, "access", 1, 1);
|
194
|
+
rb_define_attr(cTidy, "errors", 1, 0);
|
195
|
+
}
|