tidy-ext 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/pprint.h
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#ifndef __PPRINT_H__
|
2
|
+
#define __PPRINT_H__
|
3
|
+
|
4
|
+
/* pprint.h -- pretty print parse tree
|
5
|
+
|
6
|
+
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info:
|
10
|
+
$Author: arnaud02 $
|
11
|
+
$Date: 2007/02/11 09:45:08 $
|
12
|
+
$Revision: 1.9 $
|
13
|
+
|
14
|
+
*/
|
15
|
+
|
16
|
+
#include "forward.h"
|
17
|
+
|
18
|
+
/*
|
19
|
+
Block-level and unknown elements are printed on
|
20
|
+
new lines and their contents indented 2 spaces
|
21
|
+
|
22
|
+
Inline elements are printed inline.
|
23
|
+
|
24
|
+
Inline content is wrapped on spaces (except in
|
25
|
+
attribute values or preformatted text), after
|
26
|
+
start tags and before end tags
|
27
|
+
*/
|
28
|
+
|
29
|
+
/*
** Flag values: NORMAL is zero, every other flag is a distinct single bit
** so they can be OR-ed together.
** NOTE(review): presumably these are the values passed as the `mode`
** argument of the PPrint* functions declared below -- confirm in pprint.c.
*/
#define NORMAL        0u
#define PREFORMATTED  1u
#define COMMENT       2u
#define ATTRIBVALUE   4u
#define NOWRAP        8u
#define CDATA         16u
|
35
|
+
|
36
|
+
|
37
|
+
/* The pretty printer keeps at most two lines of text in the
|
38
|
+
** buffer before flushing output. We need to capture the
|
39
|
+
** indent state (indent level) at the _beginning_ of _each_
|
40
|
+
** line, not the end of just the second line.
|
41
|
+
**
|
42
|
+
** We must also keep track of the "In Attribute" and "In String"
|
43
|
+
** states at the _end_ of each line.
|
44
|
+
*/
|
45
|
+
|
46
|
+
/*
** Per-line indent state kept by the pretty printer.
** NOTE(review): attrValStart / attrStringStart presumably hold the
** "In Attribute" / "In String" markers for the line -- confirm against
** their use in pprint.c.
*/
typedef struct _TidyIndent
{
    int spaces;           /* indent for this line */
    int attrValStart;     /* attribute-value marker */
    int attrStringStart;  /* string marker */
} TidyIndent;
|
52
|
+
|
53
|
+
typedef struct _TidyPrintImpl
|
54
|
+
{
|
55
|
+
TidyAllocator *allocator; /* Allocator */
|
56
|
+
|
57
|
+
uint *linebuf;
|
58
|
+
uint lbufsize;
|
59
|
+
uint linelen;
|
60
|
+
uint wraphere;
|
61
|
+
|
62
|
+
uint ixInd;
|
63
|
+
TidyIndent indent[2]; /* Two lines worth of indent state */
|
64
|
+
} TidyPrintImpl;
|
65
|
+
|
66
|
+
|
67
|
+
#if 0 && SUPPORT_ASIAN_ENCODINGS
|
68
|
+
/* #431953 - start RJ Wraplen adjusted for smooth international ride */
|
69
|
+
uint CWrapLen( TidyDocImpl* doc, uint ind );
|
70
|
+
#endif
|
71
|
+
|
72
|
+
void TY_(InitPrintBuf)( TidyDocImpl* doc );
|
73
|
+
void TY_(FreePrintBuf)( TidyDocImpl* doc );
|
74
|
+
|
75
|
+
void TY_(PFlushLine)( TidyDocImpl* doc, uint indent );
|
76
|
+
|
77
|
+
|
78
|
+
/* print just the content of the body element.
|
79
|
+
** useful when you want to reuse material from
|
80
|
+
** other documents.
|
81
|
+
**
|
82
|
+
** -- Sebastiano Vigna <vigna@dsi.unimi.it>
|
83
|
+
*/
|
84
|
+
|
85
|
+
void TY_(PrintBody)( TidyDocImpl* doc ); /* you can print an entire document */
|
86
|
+
/* node as body using PPrintTree() */
|
87
|
+
|
88
|
+
void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
|
89
|
+
|
90
|
+
void TY_(PPrintXMLTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
|
91
|
+
|
92
|
+
|
93
|
+
#endif /* __PPRINT_H__ */
|
@@ -0,0 +1,195 @@
|
|
1
|
+
/*
|
2
|
+
ruby-tidy.c - Ruby driver for HTML TidyLib
|
3
|
+
*/
|
4
|
+
#include <ruby.h>
|
5
|
+
#include "tidy.h"
|
6
|
+
#include "buffio.h"
|
7
|
+
|
8
|
+
/*
** Input-source callback: read the next byte by calling `getc` on the Ruby
** object held in source->sourceData (treated as a VALUE).
**
** Returns the integer `getc` produced, or -1 when `getc` returned nil.
*/
int TIDY_CALL rb_tidyGetByte( TidyInputSource* source )
{
    VALUE data = (VALUE)source->sourceData;

    /* argc is 0, so nothing may follow it: newer Ruby headers check the
    ** argument count at compile time and reject a stray trailing NULL. */
    VALUE value = rb_funcall(data, rb_intern("getc"), 0);

    /* `getc` yields nil once the stream is exhausted; NUM2INT(nil) would
    ** raise TypeError from inside the parser.
    ** NOTE(review): -1 is assumed to be what tidy expects for end of
    ** input (EndOfStream in tidy.h) -- confirm. */
    if (NIL_P(value)) {
        return -1;
    }

    return NUM2INT(value);
}
|
14
|
+
|
15
|
+
/*
** Input-source callback: push `byteValue` back by calling `ungetc` on the
** Ruby object held in source->sourceData (treated as a VALUE).
** The result of `ungetc` is not needed and is discarded.
*/
void TIDY_CALL rb_tidyUngetByte( TidyInputSource* source, byte byteValue )
{
    VALUE data = (VALUE)source->sourceData;

    rb_funcall(data, rb_intern("ungetc"), 1, INT2NUM(byteValue));
}
|
20
|
+
|
21
|
+
/*
** Input-source callback: ask the Ruby object held in source->sourceData
** (treated as a VALUE) whether it is at end of input by calling `eof`.
**
** Returns `no` only when `eof` answered exactly false; any other answer
** (including nil) is reported as `yes`.
*/
Bool TIDY_CALL rb_tidyIsEOF( TidyInputSource* source )
{
    VALUE data = (VALUE)source->sourceData;

    /* argc is 0, so nothing may follow it: newer Ruby headers check the
    ** argument count at compile time and reject a stray trailing NULL. */
    VALUE value = rb_funcall(data, rb_intern("eof"), 0);

    return value == Qfalse ? no : yes;
}
|
27
|
+
|
28
|
+
static VALUE cTidy;
|
29
|
+
|
30
|
+
/* release tidy doc memory */
|
31
|
+
static void rb_tidy_free(void *ptr)
|
32
|
+
{
|
33
|
+
tidyRelease(ptr);
|
34
|
+
}
|
35
|
+
|
36
|
+
/* create a new tidy doc */
|
37
|
+
/* TODO, observe :show_warnings=>true in hash */
|
38
|
+
/*
** Allocate a tidy document, wrap it in a Ruby object of `class` (released
** through rb_tidy_free) and run `initialize` on it with no arguments.
**
** `hash` is accepted but not used yet (see the TODO above).  It must not
** be read here: this function is also registered as the singleton `new`
** with arity 0, in which case no value is supplied for it.
*/
static VALUE rb_tidy_new(VALUE class, VALUE hash)
{
    TidyDoc tdoc = tidyCreate();
    VALUE tdata = Data_Wrap_Struct(class, 0, rb_tidy_free, (struct _TidyDoc *)tdoc);

    (void)hash; /* reserved for future options */

    rb_obj_call_init(tdata, 0, NULL);
    return tdata;
}
|
48
|
+
|
49
|
+
/* parse the given input and return the tidy errors and output */
|
50
|
+
static VALUE rb_tidy_parse(VALUE self, VALUE input)
|
51
|
+
{
|
52
|
+
VALUE array;
|
53
|
+
VALUE access;
|
54
|
+
VALUE errors;
|
55
|
+
|
56
|
+
TidyDoc tdoc;
|
57
|
+
TidyBuffer output;
|
58
|
+
TidyBuffer errbuf;
|
59
|
+
int status = 0;
|
60
|
+
|
61
|
+
int contentErrors = 0;
|
62
|
+
int contentWarnings = 0;
|
63
|
+
int accessWarnings = 0;
|
64
|
+
|
65
|
+
/* See platform.h, opaque_type for typedef convention */
|
66
|
+
Data_Get_Struct(self, struct _TidyDoc, tdoc);
|
67
|
+
|
68
|
+
tidyBufInit( &output );
|
69
|
+
tidyBufInit( &errbuf );
|
70
|
+
|
71
|
+
array = rb_ary_new();
|
72
|
+
|
73
|
+
status = tidySetErrorBuffer( tdoc, &errbuf );
|
74
|
+
|
75
|
+
access = rb_iv_get(self, "@access");
|
76
|
+
tidyOptSetInt( tdoc, TidyAccessibilityCheckLevel, NUM2UINT(access));
|
77
|
+
|
78
|
+
if (status >= 0) {
|
79
|
+
|
80
|
+
int is_input_source = 0;
|
81
|
+
|
82
|
+
is_input_source =
|
83
|
+
rb_respond_to(input, rb_intern("eof")) == Qtrue &&
|
84
|
+
rb_respond_to(input, rb_intern("getc")) == Qtrue &&
|
85
|
+
rb_respond_to(input, rb_intern("ungetc")) == Qtrue;
|
86
|
+
|
87
|
+
if (is_input_source != 0) {
|
88
|
+
TidyInputSource source;
|
89
|
+
|
90
|
+
tidyInitSource(&source, (void *)&input,
|
91
|
+
(TidyGetByteFunc)rb_tidyGetByte,
|
92
|
+
(TidyUngetByteFunc)rb_tidyUngetByte,
|
93
|
+
(TidyEOFFunc)rb_tidyIsEOF);
|
94
|
+
|
95
|
+
status = tidyParseSource(tdoc, &source);
|
96
|
+
} else {
|
97
|
+
status = tidyParseString(tdoc, StringValuePtr(input));
|
98
|
+
}
|
99
|
+
}
|
100
|
+
|
101
|
+
if (status >= 0)
|
102
|
+
status = tidyCleanAndRepair( tdoc );
|
103
|
+
if (status >= 0)
|
104
|
+
status = tidyRunDiagnostics( tdoc );
|
105
|
+
if (status >= 0)
|
106
|
+
tidyErrorSummary( tdoc );
|
107
|
+
if (status >= 0)
|
108
|
+
tidyGeneralInfo( tdoc );
|
109
|
+
|
110
|
+
if (status >= 0)
|
111
|
+
status = tidySaveBuffer( tdoc, &output );
|
112
|
+
|
113
|
+
contentErrors += tidyErrorCount( tdoc );
|
114
|
+
contentWarnings += tidyWarningCount( tdoc );
|
115
|
+
accessWarnings += tidyAccessWarningCount( tdoc );
|
116
|
+
|
117
|
+
if (contentErrors > 0 || contentWarnings > 0) {
|
118
|
+
errors = rb_str_new2(errbuf.bp);
|
119
|
+
} else {
|
120
|
+
errors = rb_ary_new2("");
|
121
|
+
}
|
122
|
+
|
123
|
+
rb_iv_set(self, "@errors", errors);
|
124
|
+
|
125
|
+
rb_ary_store(array, 0, errors);
|
126
|
+
rb_ary_store(array, 1, rb_str_new2(output.bp));
|
127
|
+
|
128
|
+
tidyBufFree( &output );
|
129
|
+
tidyBufFree( &errbuf );
|
130
|
+
|
131
|
+
return array;
|
132
|
+
}
|
133
|
+
|
134
|
+
/*
** `initialize`: seed the instance variables read by the other methods.
** @access starts at 4 and @errors starts as an empty Array.
*/
static VALUE rb_tidy_init(VALUE self)
{
    rb_iv_set(self, "@access", INT2NUM(4));
    rb_iv_set(self, "@errors", rb_ary_new());

    return self;
}
|
144
|
+
|
145
|
+
/*
** `Tidy.open(hash)`: create an instance via rb_tidy_new, yield it to the
** block when one was given, and return the instance either way.
*/
static VALUE rb_tidy_open(VALUE class, VALUE hash)
{
    VALUE instance = rb_tidy_new(class, hash);

    if (!rb_block_given_p()) {
        return instance;
    }

    rb_yield(instance);
    return instance;
}
|
155
|
+
|
156
|
+
/*
** `clean(input)`: run rb_tidy_parse and return only element 1 of its
** result array (the output), discarding element 0 (the errors).
*/
static VALUE rb_tidy_clean(VALUE self, VALUE input)
{
    VALUE parsed = rb_tidy_parse(self, input);

    return rb_ary_entry(parsed, 1);
}
|
164
|
+
|
165
|
+
/* `Tidy.path`: return the current value of the @@path class variable. */
static VALUE rb_tidy_path_get(VALUE self)
{
    return rb_cv_get(self, "@@path");
}
|
171
|
+
|
172
|
+
/*
** `Tidy.path = value`: store `path` in the @@path class variable.
** Always returns nil from the C side.
*/
static VALUE rb_tidy_path_set(VALUE self, VALUE path)
{
    rb_cv_set(self, "@@path", path);

    return Qnil;
}
|
177
|
+
|
178
|
+
void Init_tidy()
|
179
|
+
{
|
180
|
+
cTidy = rb_define_class("Tidy", rb_cObject);
|
181
|
+
|
182
|
+
rb_define_class_variable(cTidy, "@@path", rb_str_new2("tidy-is-built-in"));
|
183
|
+
|
184
|
+
rb_define_singleton_method(cTidy, "new", rb_tidy_new, 0);
|
185
|
+
rb_define_singleton_method(cTidy, "open", rb_tidy_open, 1);
|
186
|
+
rb_define_singleton_method(cTidy, "path", rb_tidy_path_get, 0);
|
187
|
+
rb_define_singleton_method(cTidy, "path=", rb_tidy_path_set, 1);
|
188
|
+
|
189
|
+
rb_define_method(cTidy, "parse", rb_tidy_parse, 1);
|
190
|
+
rb_define_method(cTidy, "initialize", rb_tidy_init, 0);
|
191
|
+
rb_define_method(cTidy, "clean", rb_tidy_clean, 1);
|
192
|
+
|
193
|
+
rb_define_attr(cTidy, "access", 1, 1);
|
194
|
+
rb_define_attr(cTidy, "errors", 1, 0);
|
195
|
+
}
|