selectolax 0.4.4__cp310-cp310-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selectolax/__init__.py +8 -0
- selectolax/base.pxi +4 -0
- selectolax/lexbor/attrs.pxi +120 -0
- selectolax/lexbor/node.pxi +1112 -0
- selectolax/lexbor/node_remove.pxi +29 -0
- selectolax/lexbor/selection.pxi +215 -0
- selectolax/lexbor/util.pxi +20 -0
- selectolax/lexbor.c +53768 -0
- selectolax/lexbor.cpython-310-darwin.so +0 -0
- selectolax/lexbor.pxd +599 -0
- selectolax/lexbor.pyi +1248 -0
- selectolax/lexbor.pyx +677 -0
- selectolax/modest/node.pxi +991 -0
- selectolax/modest/selection.pxi +195 -0
- selectolax/modest/util.pxi +20 -0
- selectolax/parser.c +47848 -0
- selectolax/parser.cpython-310-darwin.so +0 -0
- selectolax/parser.pxd +578 -0
- selectolax/parser.pyi +770 -0
- selectolax/parser.pyx +443 -0
- selectolax/py.typed +0 -0
- selectolax/utils.pxi +117 -0
- selectolax-0.4.4.dist-info/METADATA +222 -0
- selectolax-0.4.4.dist-info/RECORD +27 -0
- selectolax-0.4.4.dist-info/WHEEL +6 -0
- selectolax-0.4.4.dist-info/licenses/LICENSE +10 -0
- selectolax-0.4.4.dist-info/top_level.txt +1 -0
|
Binary file
|
selectolax/parser.pxd
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
|
|
2
|
+
cdef extern from "myhtml/myhtml.h" nogil:
|
|
3
|
+
# Scalar aliases used throughout the declarations below, plus the
# myhtml_t handle type, which is declared here without any fields.
ctypedef unsigned int mystatus_t
ctypedef size_t myhtml_tag_id_t
ctypedef struct myhtml_t
|
|
6
|
+
|
|
7
|
+
ctypedef struct myhtml_tree_t:
    # Incomplete declaration: only the three fields below are listed.
    myhtml_t *myhtml
    myhtml_tree_node_t *document
    myhtml_tree_node_t *node_html
|
|
12
|
+
|
|
13
|
+
# Declared without fields; used below only as a pointer target.
ctypedef struct mchar_async_t

ctypedef struct mycore_string_t:
    # Character buffer plus its two size_t counters.
    char *data
    size_t size
    size_t length

    # Pointer to the mchar_async_t object and a node index.
    mchar_async_t *mchar
    size_t node_idx
|
|
21
|
+
|
|
22
|
+
# Declares mycore_string_raw_t with a char buffer and two size_t counters.
ctypedef struct mycore_string_raw_t:
|
|
23
|
+
char* data
|
|
24
|
+
size_t size
|
|
25
|
+
size_t length
|
|
26
|
+
|
|
27
|
+
# NOTE(review): indentation is lost in this rendering, so it is unclear
# whether `ns` is a member of mycore_string_raw_t or a separate declaration
# at extern-block level -- confirm against the original parser.pxd.
myhtml_namespace ns
|
|
28
|
+
|
|
29
|
+
ctypedef enum myhtml_options:
    # DEFAULT is zero; the three PARSE_MODE values are distinct single bits.
    MyHTML_OPTIONS_DEFAULT = 0x00
    MyHTML_OPTIONS_PARSE_MODE_SINGLE = 0x01
    MyHTML_OPTIONS_PARSE_MODE_ALL_IN_ONE = 0x02
    MyHTML_OPTIONS_PARSE_MODE_SEPARATELY = 0x04
|
|
34
|
+
|
|
35
|
+
ctypedef struct myhtml_collection_t:
    # Array of node pointers followed by its two size_t counters.
    myhtml_tree_node_t **list
    size_t size
    size_t length
|
|
39
|
+
|
|
40
|
+
ctypedef struct myhtml_tree_node_t:
    myhtml_tree_node_flags flags

    # Tag identifier and namespace of the node.
    myhtml_tag_id_t tag_id
    myhtml_namespace ns

    # Links to neighbouring nodes in the tree.
    myhtml_tree_node_t *prev
    myhtml_tree_node_t *next
    myhtml_tree_node_t *child
    myhtml_tree_node_t *parent

    myhtml_tree_node_t *last_child

    # Associated token and an untyped data pointer.
    myhtml_token_node_t *token
    void *data

    # Tree this node is attached to.
    myhtml_tree_t *tree
|
|
57
|
+
|
|
58
|
+
ctypedef enum myhtml_namespace:
    MyHTML_NAMESPACE_UNDEF = 0x00
    MyHTML_NAMESPACE_HTML = 0x01
    MyHTML_NAMESPACE_MATHML = 0x02
    MyHTML_NAMESPACE_SVG = 0x03
    MyHTML_NAMESPACE_XLINK = 0x04
    MyHTML_NAMESPACE_XML = 0x05
    MyHTML_NAMESPACE_XMLNS = 0x06
    # ANY and LAST_ENTRY are both declared with the value 0x07.
    MyHTML_NAMESPACE_ANY = 0x07
    MyHTML_NAMESPACE_LAST_ENTRY = 0x07
|
|
68
|
+
|
|
69
|
+
ctypedef enum myhtml_tree_node_flags:
    # Three values: 0, 1 and 2.
    MyHTML_TREE_NODE_UNDEF = 0
    MyHTML_TREE_NODE_PARSER_INSERTED = 1
    MyHTML_TREE_NODE_BLOCKING = 2
|
|
73
|
+
|
|
74
|
+
ctypedef enum myhtml_token_type:
    # OPEN is zero; every other member is a distinct single-bit value.
    MyHTML_TOKEN_TYPE_OPEN = 0x000
    MyHTML_TOKEN_TYPE_CLOSE = 0x001
    MyHTML_TOKEN_TYPE_CLOSE_SELF = 0x002
    MyHTML_TOKEN_TYPE_DONE = 0x004
    MyHTML_TOKEN_TYPE_WHITESPACE = 0x008
    MyHTML_TOKEN_TYPE_RCDATA = 0x010
    MyHTML_TOKEN_TYPE_RAWTEXT = 0x020
    MyHTML_TOKEN_TYPE_SCRIPT = 0x040
    MyHTML_TOKEN_TYPE_PLAINTEXT = 0x080
    MyHTML_TOKEN_TYPE_CDATA = 0x100
    MyHTML_TOKEN_TYPE_DATA = 0x200
    MyHTML_TOKEN_TYPE_COMMENT = 0x400
    MyHTML_TOKEN_TYPE_NULL = 0x800
|
|
88
|
+
|
|
89
|
+
ctypedef enum myhtml_tags:
    # Tag identifiers. Explicit values run consecutively from 0x000
    # (MyHTML_TAG__UNDEF) to 0x0fb (MyHTML_TAG__END_OF_FILE); the two
    # *_ENTRY members at the end mark the range bounds.
    MyHTML_TAG__UNDEF = 0x000
    MyHTML_TAG__TEXT = 0x001
    MyHTML_TAG__COMMENT = 0x002
    MyHTML_TAG__DOCTYPE = 0x003
    MyHTML_TAG_A = 0x004
    MyHTML_TAG_ABBR = 0x005
    MyHTML_TAG_ACRONYM = 0x006
    MyHTML_TAG_ADDRESS = 0x007
    MyHTML_TAG_ANNOTATION_XML = 0x008
    MyHTML_TAG_APPLET = 0x009
    MyHTML_TAG_AREA = 0x00a
    MyHTML_TAG_ARTICLE = 0x00b
    MyHTML_TAG_ASIDE = 0x00c
    MyHTML_TAG_AUDIO = 0x00d
    MyHTML_TAG_B = 0x00e
    MyHTML_TAG_BASE = 0x00f
    MyHTML_TAG_BASEFONT = 0x010
    MyHTML_TAG_BDI = 0x011
    MyHTML_TAG_BDO = 0x012
    MyHTML_TAG_BGSOUND = 0x013
    MyHTML_TAG_BIG = 0x014
    MyHTML_TAG_BLINK = 0x015
    MyHTML_TAG_BLOCKQUOTE = 0x016
    MyHTML_TAG_BODY = 0x017
    MyHTML_TAG_BR = 0x018
    MyHTML_TAG_BUTTON = 0x019
    MyHTML_TAG_CANVAS = 0x01a
    MyHTML_TAG_CAPTION = 0x01b
    MyHTML_TAG_CENTER = 0x01c
    MyHTML_TAG_CITE = 0x01d
    MyHTML_TAG_CODE = 0x01e
    MyHTML_TAG_COL = 0x01f
    MyHTML_TAG_COLGROUP = 0x020
    MyHTML_TAG_COMMAND = 0x021
    MyHTML_TAG_COMMENT = 0x022
    MyHTML_TAG_DATALIST = 0x023
    MyHTML_TAG_DD = 0x024
    MyHTML_TAG_DEL = 0x025
    MyHTML_TAG_DETAILS = 0x026
    MyHTML_TAG_DFN = 0x027
    MyHTML_TAG_DIALOG = 0x028
    MyHTML_TAG_DIR = 0x029
    MyHTML_TAG_DIV = 0x02a
    MyHTML_TAG_DL = 0x02b
    MyHTML_TAG_DT = 0x02c
    MyHTML_TAG_EM = 0x02d
    MyHTML_TAG_EMBED = 0x02e
    MyHTML_TAG_FIELDSET = 0x02f
    MyHTML_TAG_FIGCAPTION = 0x030
    MyHTML_TAG_FIGURE = 0x031
    MyHTML_TAG_FONT = 0x032
    MyHTML_TAG_FOOTER = 0x033
    MyHTML_TAG_FORM = 0x034
    MyHTML_TAG_FRAME = 0x035
    MyHTML_TAG_FRAMESET = 0x036
    MyHTML_TAG_H1 = 0x037
    MyHTML_TAG_H2 = 0x038
    MyHTML_TAG_H3 = 0x039
    MyHTML_TAG_H4 = 0x03a
    MyHTML_TAG_H5 = 0x03b
    MyHTML_TAG_H6 = 0x03c
    MyHTML_TAG_HEAD = 0x03d
    MyHTML_TAG_HEADER = 0x03e
    MyHTML_TAG_HGROUP = 0x03f
    MyHTML_TAG_HR = 0x040
    MyHTML_TAG_HTML = 0x041
    MyHTML_TAG_I = 0x042
    MyHTML_TAG_IFRAME = 0x043
    MyHTML_TAG_IMAGE = 0x044
    MyHTML_TAG_IMG = 0x045
    MyHTML_TAG_INPUT = 0x046
    MyHTML_TAG_INS = 0x047
    MyHTML_TAG_ISINDEX = 0x048
    MyHTML_TAG_KBD = 0x049
    MyHTML_TAG_KEYGEN = 0x04a
    MyHTML_TAG_LABEL = 0x04b
    MyHTML_TAG_LEGEND = 0x04c
    MyHTML_TAG_LI = 0x04d
    MyHTML_TAG_LINK = 0x04e
    MyHTML_TAG_LISTING = 0x04f
    MyHTML_TAG_MAIN = 0x050
    MyHTML_TAG_MAP = 0x051
    MyHTML_TAG_MARK = 0x052
    MyHTML_TAG_MARQUEE = 0x053
    MyHTML_TAG_MENU = 0x054
    MyHTML_TAG_MENUITEM = 0x055
    MyHTML_TAG_META = 0x056
    MyHTML_TAG_METER = 0x057
    MyHTML_TAG_MTEXT = 0x058
    MyHTML_TAG_NAV = 0x059
    MyHTML_TAG_NOBR = 0x05a
    MyHTML_TAG_NOEMBED = 0x05b
    MyHTML_TAG_NOFRAMES = 0x05c
    MyHTML_TAG_NOSCRIPT = 0x05d
    MyHTML_TAG_OBJECT = 0x05e
    MyHTML_TAG_OL = 0x05f
    MyHTML_TAG_OPTGROUP = 0x060
    MyHTML_TAG_OPTION = 0x061
    MyHTML_TAG_OUTPUT = 0x062
    MyHTML_TAG_P = 0x063
    MyHTML_TAG_PARAM = 0x064
    MyHTML_TAG_PLAINTEXT = 0x065
    MyHTML_TAG_PRE = 0x066
    MyHTML_TAG_PROGRESS = 0x067
    MyHTML_TAG_Q = 0x068
    MyHTML_TAG_RB = 0x069
    MyHTML_TAG_RP = 0x06a
    MyHTML_TAG_RT = 0x06b
    MyHTML_TAG_RTC = 0x06c
    MyHTML_TAG_RUBY = 0x06d
    MyHTML_TAG_S = 0x06e
    MyHTML_TAG_SAMP = 0x06f
    MyHTML_TAG_SCRIPT = 0x070
    MyHTML_TAG_SECTION = 0x071
    MyHTML_TAG_SELECT = 0x072
    MyHTML_TAG_SMALL = 0x073
    MyHTML_TAG_SOURCE = 0x074
    MyHTML_TAG_SPAN = 0x075
    MyHTML_TAG_STRIKE = 0x076
    MyHTML_TAG_STRONG = 0x077
    MyHTML_TAG_STYLE = 0x078
    MyHTML_TAG_SUB = 0x079
    MyHTML_TAG_SUMMARY = 0x07a
    MyHTML_TAG_SUP = 0x07b
    MyHTML_TAG_SVG = 0x07c
    MyHTML_TAG_TABLE = 0x07d
    MyHTML_TAG_TBODY = 0x07e
    MyHTML_TAG_TD = 0x07f
    MyHTML_TAG_TEMPLATE = 0x080
    MyHTML_TAG_TEXTAREA = 0x081
    MyHTML_TAG_TFOOT = 0x082
    MyHTML_TAG_TH = 0x083
    MyHTML_TAG_THEAD = 0x084
    MyHTML_TAG_TIME = 0x085
    MyHTML_TAG_TITLE = 0x086
    MyHTML_TAG_TR = 0x087
    MyHTML_TAG_TRACK = 0x088
    MyHTML_TAG_TT = 0x089
    MyHTML_TAG_U = 0x08a
    MyHTML_TAG_UL = 0x08b
    MyHTML_TAG_VAR = 0x08c
    MyHTML_TAG_VIDEO = 0x08d
    MyHTML_TAG_WBR = 0x08e
    MyHTML_TAG_XMP = 0x08f
    MyHTML_TAG_ALTGLYPH = 0x090
    MyHTML_TAG_ALTGLYPHDEF = 0x091
    MyHTML_TAG_ALTGLYPHITEM = 0x092
    MyHTML_TAG_ANIMATE = 0x093
    MyHTML_TAG_ANIMATECOLOR = 0x094
    MyHTML_TAG_ANIMATEMOTION = 0x095
    MyHTML_TAG_ANIMATETRANSFORM = 0x096
    MyHTML_TAG_CIRCLE = 0x097
    MyHTML_TAG_CLIPPATH = 0x098
    MyHTML_TAG_COLOR_PROFILE = 0x099
    MyHTML_TAG_CURSOR = 0x09a
    MyHTML_TAG_DEFS = 0x09b
    MyHTML_TAG_DESC = 0x09c
    MyHTML_TAG_ELLIPSE = 0x09d
    MyHTML_TAG_FEBLEND = 0x09e
    MyHTML_TAG_FECOLORMATRIX = 0x09f
    MyHTML_TAG_FECOMPONENTTRANSFER = 0x0a0
    MyHTML_TAG_FECOMPOSITE = 0x0a1
    MyHTML_TAG_FECONVOLVEMATRIX = 0x0a2
    MyHTML_TAG_FEDIFFUSELIGHTING = 0x0a3
    MyHTML_TAG_FEDISPLACEMENTMAP = 0x0a4
    MyHTML_TAG_FEDISTANTLIGHT = 0x0a5
    MyHTML_TAG_FEDROPSHADOW = 0x0a6
    MyHTML_TAG_FEFLOOD = 0x0a7
    MyHTML_TAG_FEFUNCA = 0x0a8
    MyHTML_TAG_FEFUNCB = 0x0a9
    MyHTML_TAG_FEFUNCG = 0x0aa
    MyHTML_TAG_FEFUNCR = 0x0ab
    MyHTML_TAG_FEGAUSSIANBLUR = 0x0ac
    MyHTML_TAG_FEIMAGE = 0x0ad
    MyHTML_TAG_FEMERGE = 0x0ae
    MyHTML_TAG_FEMERGENODE = 0x0af
    MyHTML_TAG_FEMORPHOLOGY = 0x0b0
    MyHTML_TAG_FEOFFSET = 0x0b1
    MyHTML_TAG_FEPOINTLIGHT = 0x0b2
    MyHTML_TAG_FESPECULARLIGHTING = 0x0b3
    MyHTML_TAG_FESPOTLIGHT = 0x0b4
    MyHTML_TAG_FETILE = 0x0b5
    MyHTML_TAG_FETURBULENCE = 0x0b6
    MyHTML_TAG_FILTER = 0x0b7
    MyHTML_TAG_FONT_FACE = 0x0b8
    MyHTML_TAG_FONT_FACE_FORMAT = 0x0b9
    MyHTML_TAG_FONT_FACE_NAME = 0x0ba
    MyHTML_TAG_FONT_FACE_SRC = 0x0bb
    MyHTML_TAG_FONT_FACE_URI = 0x0bc
    MyHTML_TAG_FOREIGNOBJECT = 0x0bd
    MyHTML_TAG_G = 0x0be
    MyHTML_TAG_GLYPH = 0x0bf
    MyHTML_TAG_GLYPHREF = 0x0c0
    MyHTML_TAG_HKERN = 0x0c1
    MyHTML_TAG_LINE = 0x0c2
    MyHTML_TAG_LINEARGRADIENT = 0x0c3
    MyHTML_TAG_MARKER = 0x0c4
    MyHTML_TAG_MASK = 0x0c5
    MyHTML_TAG_METADATA = 0x0c6
    MyHTML_TAG_MISSING_GLYPH = 0x0c7
    MyHTML_TAG_MPATH = 0x0c8
    MyHTML_TAG_PATH = 0x0c9
    MyHTML_TAG_PATTERN = 0x0ca
    MyHTML_TAG_POLYGON = 0x0cb
    MyHTML_TAG_POLYLINE = 0x0cc
    MyHTML_TAG_RADIALGRADIENT = 0x0cd
    MyHTML_TAG_RECT = 0x0ce
    MyHTML_TAG_SET = 0x0cf
    MyHTML_TAG_STOP = 0x0d0
    MyHTML_TAG_SWITCH = 0x0d1
    MyHTML_TAG_SYMBOL = 0x0d2
    MyHTML_TAG_TEXT = 0x0d3
    MyHTML_TAG_TEXTPATH = 0x0d4
    MyHTML_TAG_TREF = 0x0d5
    MyHTML_TAG_TSPAN = 0x0d6
    MyHTML_TAG_USE = 0x0d7
    MyHTML_TAG_VIEW = 0x0d8
    MyHTML_TAG_VKERN = 0x0d9
    MyHTML_TAG_MATH = 0x0da
    MyHTML_TAG_MACTION = 0x0db
    MyHTML_TAG_MALIGNGROUP = 0x0dc
    MyHTML_TAG_MALIGNMARK = 0x0dd
    MyHTML_TAG_MENCLOSE = 0x0de
    MyHTML_TAG_MERROR = 0x0df
    MyHTML_TAG_MFENCED = 0x0e0
    MyHTML_TAG_MFRAC = 0x0e1
    MyHTML_TAG_MGLYPH = 0x0e2
    MyHTML_TAG_MI = 0x0e3
    MyHTML_TAG_MLABELEDTR = 0x0e4
    MyHTML_TAG_MLONGDIV = 0x0e5
    MyHTML_TAG_MMULTISCRIPTS = 0x0e6
    MyHTML_TAG_MN = 0x0e7
    MyHTML_TAG_MO = 0x0e8
    MyHTML_TAG_MOVER = 0x0e9
    MyHTML_TAG_MPADDED = 0x0ea
    MyHTML_TAG_MPHANTOM = 0x0eb
    MyHTML_TAG_MROOT = 0x0ec
    MyHTML_TAG_MROW = 0x0ed
    MyHTML_TAG_MS = 0x0ee
    MyHTML_TAG_MSCARRIES = 0x0ef
    MyHTML_TAG_MSCARRY = 0x0f0
    MyHTML_TAG_MSGROUP = 0x0f1
    MyHTML_TAG_MSLINE = 0x0f2
    MyHTML_TAG_MSPACE = 0x0f3
    MyHTML_TAG_MSQRT = 0x0f4
    MyHTML_TAG_MSROW = 0x0f5
    MyHTML_TAG_MSTACK = 0x0f6
    MyHTML_TAG_MSTYLE = 0x0f7
    MyHTML_TAG_MSUB = 0x0f8
    MyHTML_TAG_MSUP = 0x0f9
    MyHTML_TAG_MSUBSUP = 0x0fa
    MyHTML_TAG__END_OF_FILE = 0x0fb

    # FIRST_ENTRY aliases _TEXT (0x001); LAST_ENTRY is one past _END_OF_FILE.
    MyHTML_TAG_FIRST_ENTRY = MyHTML_TAG__TEXT
    MyHTML_TAG_LAST_ENTRY = 0x0fc
|
|
344
|
+
|
|
345
|
+
ctypedef enum myhtml_tree_parse_flags_t:
    MyHTML_TREE_PARSE_FLAGS_CLEAN = 0x000
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_BUILD_TREE = 0x001
    # 0x003 has bit 0x001 set as well, so it overlaps WITHOUT_BUILD_TREE.
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_PROCESS_TOKEN = 0x003
    MyHTML_TREE_PARSE_FLAGS_SKIP_WHITESPACE_TOKEN = 0x004
    MyHTML_TREE_PARSE_FLAGS_WITHOUT_DOCTYPE_IN_TREE = 0x008
|
|
351
|
+
|
|
352
|
+
ctypedef struct myhtml_token_node_t:
    myhtml_tag_id_t tag_id

    mycore_string_t str

    # Begin/length pairs for the raw span and the element span.
    size_t raw_begin
    size_t raw_length

    size_t element_begin
    size_t element_length

    # First and last entries of the attribute list.
    myhtml_token_attr_t *attr_first
    myhtml_token_attr_t *attr_last

    myhtml_token_type type
|
|
367
|
+
|
|
368
|
+
ctypedef struct myhtml_token_attr_t:
    # Doubly linked list pointers.
    myhtml_token_attr_t *next
    myhtml_token_attr_t *prev

    # Attribute key and value strings.
    mycore_string_t key
    mycore_string_t value

    # Begin/length pairs for the raw key and raw value.
    size_t raw_key_begin
    size_t raw_key_length
    size_t raw_value_begin
    size_t raw_value_length

    myhtml_namespace ns
|
|
381
|
+
|
|
382
|
+
ctypedef struct myhtml_tree_attr_t:
    # Doubly linked list pointers.
    myhtml_tree_attr_t *next
    myhtml_tree_attr_t *prev

    # Attribute key and value strings.
    mycore_string_t key
    mycore_string_t value

    # Begin/length pairs for the raw key and raw value.
    size_t raw_key_begin
    size_t raw_key_length
    size_t raw_value_begin
    size_t raw_value_length
|
|
393
|
+
|
|
394
|
+
# Declarations of the create/init functions for the myhtml object and the
# tree, followed by myhtml_parse. The init and parse functions return mystatus_t.
myhtml_t *myhtml_create()
mystatus_t myhtml_init(
    myhtml_t *myhtml,
    myhtml_options opt,
    size_t thread_count,
    size_t queue_size
)
myhtml_tree_t *myhtml_tree_create()
mystatus_t myhtml_tree_init(myhtml_tree_t *tree, myhtml_t *myhtml)
mystatus_t myhtml_parse(
    myhtml_tree_t *tree,
    myencoding_t encoding,
    const char *html,
    size_t html_size
)
|
|
399
|
+
|
|
400
|
+
# Declarations of the attribute, text and tag-name lookup functions.
myhtml_tree_attr_t *myhtml_node_attribute_first(myhtml_tree_node_t *node)
myhtml_tree_attr_t *myhtml_attribute_by_key(
    myhtml_tree_node_t *node,
    const char *key,
    size_t key_len
)
const char *myhtml_node_text(myhtml_tree_node_t *node, size_t *length)
mycore_string_t *myhtml_node_string(myhtml_tree_node_t *node)
const char *myhtml_tag_name_by_id(
    myhtml_tree_t *tree,
    myhtml_tag_id_t tag_id,
    size_t *length
)
|
|
405
|
+
|
|
406
|
+
# Declarations of the *_destroy functions; each is declared as returning a
# pointer of the same type as its argument.
myhtml_collection_t *myhtml_collection_destroy(myhtml_collection_t *collection)
myhtml_tree_t *myhtml_tree_destroy(myhtml_tree_t *tree)
myhtml_t *myhtml_destroy(myhtml_t *myhtml)
|
|
409
|
+
|
|
410
|
+
# Declarations of the tree accessors for the document, body and head nodes.
myhtml_tree_node_t *myhtml_tree_get_document(myhtml_tree_t *tree)
myhtml_tree_node_t *myhtml_tree_get_node_body(myhtml_tree_t *tree)
myhtml_tree_node_t *myhtml_tree_get_node_head(myhtml_tree_t *tree)

# Lookup by name; takes a collection pointer and a mystatus_t out-pointer.
myhtml_collection_t *myhtml_get_nodes_by_name(
    myhtml_tree_t *tree,
    myhtml_collection_t *collection,
    const char *name,
    size_t length,
    mystatus_t *status
)
|
|
416
|
+
|
|
417
|
+
# Declarations of the node delete / insert / create / clone functions and
# of the parse-flags setter.
void myhtml_node_delete(myhtml_tree_node_t *node)
void myhtml_node_delete_recursive(myhtml_tree_node_t *node)
void myhtml_tree_parse_flags_set(
    myhtml_tree_t *tree,
    myhtml_tree_parse_flags_t parse_flags
)
myhtml_tree_node_t *myhtml_node_insert_before(
    myhtml_tree_node_t *target,
    myhtml_tree_node_t *node
)
myhtml_tree_node_t *myhtml_node_insert_after(
    myhtml_tree_node_t *target,
    myhtml_tree_node_t *node
)
myhtml_tree_node_t *myhtml_node_create(
    myhtml_tree_t *tree,
    myhtml_tag_id_t tag_id,
    myhtml_namespace ns
)
myhtml_tree_node_t *myhtml_node_clone_deep(
    myhtml_tree_t *dest_tree,
    myhtml_tree_node_t *src
)
myhtml_tree_node_t *myhtml_node_append_child(
    myhtml_tree_node_t *target,
    myhtml_tree_node_t *node
)
|
|
425
|
+
|
|
426
|
+
# Declarations of the text setter and of the attribute remove/add functions.
# myhtml_attribute_by_key is declared once, earlier in this extern block,
# and is therefore not repeated here.
mycore_string_t *myhtml_node_text_set(
    myhtml_tree_node_t *node,
    const char *text,
    size_t length,
    myencoding_t encoding
)
myhtml_tree_attr_t *myhtml_attribute_remove_by_key(
    myhtml_tree_node_t *node,
    const char *key,
    size_t key_len
)
myhtml_tree_attr_t *myhtml_attribute_add(
    myhtml_tree_node_t *node,
    const char *key,
    size_t key_len,
    const char *value,
    size_t value_len,
    myencoding_t encoding
)

myhtml_tree_node_t *myhtml_node_insert_to_appropriate_place(
    myhtml_tree_node_t *target,
    myhtml_tree_node_t *node
)
|
|
434
|
+
|
|
435
|
+
# Node-level functions declared from myhtml/tree.h (called without the GIL).
cdef extern from "myhtml/tree.h" nogil:
    myhtml_tree_node_t *myhtml_tree_node_clone(myhtml_tree_node_t *node)
    myhtml_tree_node_t *myhtml_tree_node_insert_root(
        myhtml_tree_t *tree,
        myhtml_token_node_t *token,
        myhtml_namespace ns
    )
    void myhtml_tree_node_add_child(
        myhtml_tree_node_t *root,
        myhtml_tree_node_t *node
    )
|
|
440
|
+
|
|
441
|
+
# Serialization function declared from myhtml/serialization.h; it takes a
# scope node and a mycore_string_raw_t pointer and returns mystatus_t.
cdef extern from "myhtml/serialization.h" nogil:
    mystatus_t myhtml_serialization(
        myhtml_tree_node_t *scope_node,
        mycore_string_raw_t *str
    )
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
cdef extern from "myencoding/encoding.h" nogil:
|
|
446
|
+
ctypedef enum myencoding_t:
    # DEFAULT and UTF_8 are both declared as 0x00. The values 0x01 and 0x03
    # are not declared here (0x01 is kept only as a commented-out entry).
    MyENCODING_DEFAULT = 0x00
    # MyENCODING_AUTO = 0x01 // future
    MyENCODING_NOT_DETERMINED = 0x02
    MyENCODING_UTF_8 = 0x00  # default encoding
    MyENCODING_UTF_16LE = 0x04
    MyENCODING_UTF_16BE = 0x05
    MyENCODING_X_USER_DEFINED = 0x06
    MyENCODING_BIG5 = 0x07
    MyENCODING_EUC_JP = 0x08
    MyENCODING_EUC_KR = 0x09
    MyENCODING_GB18030 = 0x0a
    MyENCODING_GBK = 0x0b
    MyENCODING_IBM866 = 0x0c
    MyENCODING_ISO_2022_JP = 0x0d
    MyENCODING_ISO_8859_10 = 0x0e
    MyENCODING_ISO_8859_13 = 0x0f
    MyENCODING_ISO_8859_14 = 0x10
    MyENCODING_ISO_8859_15 = 0x11
    MyENCODING_ISO_8859_16 = 0x12
    MyENCODING_ISO_8859_2 = 0x13
    MyENCODING_ISO_8859_3 = 0x14
    MyENCODING_ISO_8859_4 = 0x15
    MyENCODING_ISO_8859_5 = 0x16
    MyENCODING_ISO_8859_6 = 0x17
    MyENCODING_ISO_8859_7 = 0x18
    MyENCODING_ISO_8859_8 = 0x19
    MyENCODING_ISO_8859_8_I = 0x1a
    MyENCODING_KOI8_R = 0x1b
    MyENCODING_KOI8_U = 0x1c
    MyENCODING_MACINTOSH = 0x1d
    MyENCODING_SHIFT_JIS = 0x1e
    MyENCODING_WINDOWS_1250 = 0x1f
    MyENCODING_WINDOWS_1251 = 0x20
    MyENCODING_WINDOWS_1252 = 0x21
    MyENCODING_WINDOWS_1253 = 0x22
    MyENCODING_WINDOWS_1254 = 0x23
    MyENCODING_WINDOWS_1255 = 0x24
    MyENCODING_WINDOWS_1256 = 0x25
    MyENCODING_WINDOWS_1257 = 0x26
    MyENCODING_WINDOWS_1258 = 0x27
    MyENCODING_WINDOWS_874 = 0x28
    MyENCODING_X_MAC_CYRILLIC = 0x29
    MyENCODING_LAST_ENTRY = 0x2a
|
|
490
|
+
|
|
491
|
+
# Declarations of the encoding detection functions and the id-to-name lookup.
# The two detect functions return bint and write through `encoding`.
bint myencoding_detect_bom(
    const char *text,
    size_t length,
    myencoding_t *encoding
)
bint myencoding_detect(
    const char *text,
    size_t length,
    myencoding_t *encoding
)
myencoding_t myencoding_prescan_stream_to_determine_encoding(
    const char *data,
    size_t data_size
)
const char *myencoding_name_by_id(myencoding_t encoding, size_t *length)
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
cdef extern from "mycss/mycss.h" nogil:
|
|
498
|
+
# Types declared without fields.
ctypedef struct mycss_t

ctypedef struct mycss_selectors_t

ctypedef struct mycss_selectors_entries_list_t
ctypedef struct mycss_declaration_entry_t

ctypedef struct mycss_entry_t:
    # Incomplete declaration: only the `mycss` field is listed.
    mycss_t *mycss
|
|
508
|
+
|
|
509
|
+
ctypedef enum mycss_selectors_flags:
    MyCSS_SELECTORS_FLAGS_UNDEF = 0x00
    MyCSS_SELECTORS_FLAGS_SELECTOR_BAD = 0x01

# Alias with the _t suffix for the enum above.
ctypedef mycss_selectors_flags mycss_selectors_flags_t
|
|
513
|
+
|
|
514
|
+
ctypedef struct mycss_selectors_list_t:
    # Entries array pointer and its length.
    mycss_selectors_entries_list_t *entries_list
    size_t entries_list_length

    mycss_declaration_entry_t *declaration_entry

    mycss_selectors_flags_t flags

    # Links to related selector lists.
    mycss_selectors_list_t *parent
    mycss_selectors_list_t *next
    mycss_selectors_list_t *prev
|
|
525
|
+
|
|
526
|
+
# CSS init routines
mycss_t *mycss_create()
mystatus_t mycss_init(mycss_t *mycss)
mycss_entry_t *mycss_entry_create()
mystatus_t mycss_entry_init(mycss_t *mycss, mycss_entry_t *entry)

# Selector parsing; the status is reported through `out_status`.
mycss_selectors_list_t *mycss_selectors_parse(
    mycss_selectors_t *selectors,
    myencoding_t encoding,
    const char *data,
    size_t data_size,
    mystatus_t *out_status
)
mycss_selectors_t *mycss_entry_selectors(mycss_entry_t *entry)

# Destroy functions; each takes a `self_destroy` flag.
mycss_selectors_list_t *mycss_selectors_list_destroy(
    mycss_selectors_t *selectors,
    mycss_selectors_list_t *selectors_list,
    bint self_destroy
)
mycss_entry_t *mycss_entry_destroy(mycss_entry_t *entry, bint self_destroy)
mycss_t *mycss_destroy(mycss_t *mycss, bint self_destroy)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# Finder type (declared without fields) and its functions, declared from
# modest/finder/finder.h.
cdef extern from "modest/finder/finder.h" nogil:
    ctypedef struct modest_finder_t

    modest_finder_t *modest_finder_create_simple()
    mystatus_t modest_finder_by_selectors_list(
        modest_finder_t *finder,
        myhtml_tree_node_t *scope_node,
        mycss_selectors_list_t *selector_list,
        myhtml_collection_t **collection
    )
    modest_finder_t *modest_finder_destroy(
        modest_finder_t *finder,
        bint self_destroy
    )
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
cdef class HTMLParser:
    # C-level attribute and method declarations for the HTMLParser
    # extension type.

    # Attributes; the ones marked `public` are also exposed to Python.
    cdef myhtml_tree_t *html_tree
    cdef public bint detect_encoding
    cdef public bint use_meta_tags
    cdef myencoding_t _encoding
    cdef public unicode decode_errors
    cdef public bytes raw_html
    cdef object cached_script_texts
    cdef object cached_script_srcs

    # _detect_encoding is declared nogil; _parse_html is declared
    # `except -1`, so a -1 return signals a raised exception.
    cdef void _detect_encoding(self, char* html, size_t html_len) nogil
    cdef int _parse_html(self, char* html, size_t html_len) except -1

    # Static C-level constructor taking an existing tree and the parser
    # settings.
    @staticmethod
    cdef HTMLParser from_tree(
        myhtml_tree_t * tree,
        bytes raw_html,
        bint detect_encoding,
        bint use_meta_tags,
        str decode_errors,
        myencoding_t encoding
    )
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
cdef class Stack:
    # C-level declarations for the Stack extension type, which holds
    # myhtml_tree_node_t pointers.

    # Two size_t counters and the array of node pointers.
    cdef size_t capacity
    cdef size_t top
    cdef myhtml_tree_node_t **_stack

    # push and resize are declared `except -1`, so a -1 return signals a
    # raised exception.
    cdef bint is_empty(self)
    cdef int push(self, myhtml_tree_node_t* res) except -1
    cdef myhtml_tree_node_t *pop(self)
    cdef int resize(self) except -1
|