gumbo-html 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +27 -10
  2. package/binding.gyp +49 -0
  3. package/examples/example.js +87 -0
  4. package/examples/scrape.js +301 -0
  5. package/index.d.ts +58 -3
  6. package/index.js +7 -2
  7. package/lib/wrapper.js +385 -0
  8. package/package.json +36 -5
  9. package/src/addon.cc +19 -0
  10. package/src/gumbo-parser/COPYING +201 -0
  11. package/src/gumbo-parser/README.md +8 -0
  12. package/src/gumbo-parser/src/attribute.c +44 -0
  13. package/src/gumbo-parser/src/attribute.h +37 -0
  14. package/src/gumbo-parser/src/char_ref.c +23069 -0
  15. package/src/gumbo-parser/src/char_ref.h +60 -0
  16. package/src/gumbo-parser/src/error.c +279 -0
  17. package/src/gumbo-parser/src/error.h +225 -0
  18. package/src/gumbo-parser/src/gumbo.h +671 -0
  19. package/src/gumbo-parser/src/insertion_mode.h +57 -0
  20. package/src/gumbo-parser/src/parser.c +4192 -0
  21. package/src/gumbo-parser/src/parser.h +57 -0
  22. package/src/gumbo-parser/src/string_buffer.c +110 -0
  23. package/src/gumbo-parser/src/string_buffer.h +84 -0
  24. package/src/gumbo-parser/src/string_piece.c +48 -0
  25. package/src/gumbo-parser/src/string_piece.h +38 -0
  26. package/src/gumbo-parser/src/tag.c +95 -0
  27. package/src/gumbo-parser/src/tag_enum.h +153 -0
  28. package/src/gumbo-parser/src/tag_gperf.h +105 -0
  29. package/src/gumbo-parser/src/tag_sizes.h +4 -0
  30. package/src/gumbo-parser/src/tag_strings.h +153 -0
  31. package/src/gumbo-parser/src/token_type.h +41 -0
  32. package/src/gumbo-parser/src/tokenizer.c +2897 -0
  33. package/src/gumbo-parser/src/tokenizer.h +123 -0
  34. package/src/gumbo-parser/src/tokenizer_states.h +103 -0
  35. package/src/gumbo-parser/src/utf8.c +270 -0
  36. package/src/gumbo-parser/src/utf8.h +132 -0
  37. package/src/gumbo-parser/src/util.c +58 -0
  38. package/src/gumbo-parser/src/util.h +60 -0
  39. package/src/gumbo-parser/src/vector.c +123 -0
  40. package/src/gumbo-parser/src/vector.h +67 -0
  41. package/src/html_document.cc +411 -0
  42. package/src/html_document.h +56 -0
  43. package/src/html_element.cc +963 -0
  44. package/src/html_element.h +70 -0
  45. package/src/include/win/strings.h +11 -0
  46. package/src/jsa.c +182 -0
  47. package/src/jsa.h +44 -0
  48. package/src/xnode.c +372 -0
  49. package/src/xnode_query.c +330 -0
  50. package/src/xnode_query.h +186 -0
  51. package/src/xnode_query_parser.c +414 -0
  52. package/install.js +0 -15
package/README.md CHANGED
@@ -1,26 +1,43 @@
1
+ # gumbo-html
2
+
1
3
  CSS selector based on Gumbo HTML parser.
2
4
 
3
5
  ## Installation
6
+
7
+ ```sh
8
+ npm install gumbo-html
4
9
  ```
5
- $ npm install gumbo-html
6
- ```
10
+
11
+ `gumbo-html` is a native Node.js addon and is compiled from source on install.
12
+ You'll need the standard `node-gyp` toolchain:
13
+
14
+ - Python 3
15
+ - A C/C++ compiler (Xcode Command Line Tools on macOS, `build-essential` on
16
+ Linux, or Visual Studio Build Tools on Windows)
17
+
18
+ See the [node-gyp docs](https://github.com/nodejs/node-gyp#installation) for
19
+ platform-specific setup details.
7
20
 
8
21
  ## Usage
9
22
 
10
- Example:
11
23
  ```ts
12
- import {parse} from 'gumbo-html';
24
+ import { parse } from 'gumbo-html';
13
25
 
14
- const html = `
26
+ const html = `
15
27
  <html>
16
28
  <p class="foo bar blah">Foo</p>
17
29
  <p class="bar">Bar</p>
18
30
  </html>
19
- `
31
+ `;
20
32
 
21
- const xdoc = parse(html);
33
+ const doc = parse(html);
22
34
 
23
- xdoc.find('.bar').forEach(el => {
24
- console.log(el.innerText)
35
+ doc.find('.bar').forEach((el) => {
36
+ console.log(el.innerText);
25
37
  });
26
- ```
38
+ ```
39
+
40
+ ## License
41
+
42
+ MIT. Bundles [google/gumbo-parser](https://github.com/google/gumbo-parser)
43
+ (Apache-2.0) under `src/gumbo-parser/`.
package/binding.gyp ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "targets": [
3
+ {
4
+ "target_name": "html",
5
+ "sources": [
6
+ "src/addon.cc",
7
+ "src/html_document.cc",
8
+ "src/html_element.cc",
9
+ "src/xnode.c",
10
+ "src/xnode_query.c",
11
+ "src/xnode_query_parser.c",
12
+ "src/jsa.c",
13
+ "src/gumbo-parser/src/attribute.c",
14
+ "src/gumbo-parser/src/error.c",
15
+ "src/gumbo-parser/src/string_buffer.c",
16
+ "src/gumbo-parser/src/tag.c",
17
+ "src/gumbo-parser/src/utf8.c",
18
+ "src/gumbo-parser/src/vector.c",
19
+ "src/gumbo-parser/src/char_ref.c",
20
+ "src/gumbo-parser/src/parser.c",
21
+ "src/gumbo-parser/src/string_piece.c",
22
+ "src/gumbo-parser/src/tokenizer.c",
23
+ "src/gumbo-parser/src/util.c"
24
+ ],
25
+ "include_dirs": [
26
+ "<!@(node -p \"require('node-addon-api').include\")",
27
+ "src",
28
+ "src/gumbo-parser/src"
29
+ ],
30
+ "cflags!": ["-fno-exceptions"],
31
+ "cflags_cc!": ["-fno-exceptions"],
32
+ "xcode_settings": {
33
+ "GCC_ENABLE_CPP_EXCEPTIONS": "YES",
34
+ "CLANG_CXX_LIBRARY": "libc++",
35
+ "MACOSX_DEPLOYMENT_TARGET": "10.15"
36
+ },
37
+ "msvs_settings": {
38
+ "VCCLCompilerTool": {
39
+ "ExceptionHandling": 1
40
+ }
41
+ },
42
+ "conditions": [
43
+ ["OS==\"win\"", {
44
+ "include_dirs": ["src/include/win"]
45
+ }]
46
+ ]
47
+ }
48
+ ]
49
+ }
package/examples/example.js ADDED
@@ -0,0 +1,87 @@
1
+ 'use strict';
2
+
3
+ const { parse } = require('..');
4
+
5
+ const html = `
6
+ <!doctype html>
7
+ <html>
8
+ <body>
9
+ <main id="content">
10
+ <article class="post featured" data-slug="hello-world">
11
+ <h1>Hello world</h1>
12
+ <p class="summary">A short introduction.</p>
13
+ <a class="cta primary" href="/hello">Read more</a>
14
+ </article>
15
+
16
+ <article class="post" data-slug="second-post">
17
+ <h1>Second post</h1>
18
+ <p class="summary">A follow-up note.</p>
19
+ <a class="cta" href="/second">Open post</a>
20
+ </article>
21
+ </main>
22
+ </body>
23
+ </html>
24
+ `;
25
+
26
+ const doc = parse(html);
27
+
28
+ // documentElement returns the parsed <html> element.
29
+ console.log('Root tag:', doc.documentElement.tagName);
30
+
31
+ // find(selector) returns every matching element under the document or element.
32
+ const posts = doc.find('article.post');
33
+ console.log('Post count:', posts.length);
34
+
35
+ posts.forEach((post, index) => {
36
+ // attr(name) returns undefined when the attribute is missing.
37
+ console.log(`Post ${index + 1}:`, post.attr('data-slug'));
38
+
39
+ // first(selector) returns the first match or null.
40
+ const title = post.first('h1');
41
+ console.log(' title:', title ? title.innerText : '(missing)');
42
+
43
+ // hasClass(name) and hasAttribute(name) are convenience checks.
44
+ console.log(' featured:', post.hasClass('featured'));
45
+ console.log(' has slug:', post.hasAttribute('data-slug'));
46
+ });
47
+
48
+ // first_s(selector) is the throwing version of first(selector).
49
+ // Use it when the element is required for the rest of your code.
50
+ const content = doc.first_s('#content');
51
+ console.log('Main outerHTML starts with:', content.outerHTML.slice(0, 20));
52
+
53
+ // only(selector) returns the match only when exactly one element is found.
54
+ const featuredPost = doc.only('article.featured');
55
+ console.log('Featured slug:', featuredPost.attr_s('data-slug'));
56
+
57
+ // only_s(selector) throws unless exactly one element is found.
58
+ try {
59
+ doc.only_s('article.post');
60
+ } catch (error) {
61
+ console.log('only_s on many posts:', error.message);
62
+ }
63
+
64
+ // Element-scoped queries search only inside that element.
65
+ const firstPost = doc.first_s('article.post');
66
+ console.log('CTA href:', firstPost.first_s('a.cta').attr_s('href'));
67
+
68
+ // next(selector) and prev(selector) walk element siblings.
69
+ const secondPost = firstPost.next('article.post');
70
+ console.log('Next post slug:', secondPost.attr_s('data-slug'));
71
+ console.log('Previous post slug:', secondPost.prev('article.post').attr_s('data-slug'));
72
+
73
+ // childNodes includes text/whitespace nodes as well as element nodes.
74
+ // nodeType helps distinguish them.
75
+ const childTypes = content.childNodes.map((node) => node.nodeType);
76
+ console.log('Main child node types:', childTypes.join(', '));
77
+
78
+ // parent returns the parent element, or null for the document root.
79
+ console.log('First post parent:', firstPost.parent.tagName);
80
+ console.log('Document root parent:', doc.documentElement.parent);
81
+
82
+ // attr_s(name) throws when an attribute is required but missing.
83
+ try {
84
+ firstPost.attr_s('missing');
85
+ } catch (error) {
86
+ console.log('attr_s on missing attribute:', error.message);
87
+ }
package/examples/scrape.js ADDED
@@ -0,0 +1,301 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * gumbo-html examples
5
+ *
6
+ * Demonstrates all the new features including:
7
+ * - Friendly aliases (firstOrThrow, onlyOrThrow, attrOrThrow)
8
+ * - Convenience methods (exists, count, text, attr with selector)
9
+ * - Traversal (closest, children, siblings, matches, is)
10
+ * - Table extraction (rows, table)
11
+ * - Text normalization
12
+ * - Structured extraction
13
+ * - URL resolution with baseUrl
14
+ * - Common extractors (meta, links, images, title, etc.)
15
+ */
16
+
17
+ const html = require('..');
18
+
19
+ // ============================================================
20
+ // Parse an HTML page
21
+ // ============================================================
22
+ const HTML = `
23
+ <!DOCTYPE html>
24
+ <html>
25
+ <head>
26
+ <title>Example Blog</title>
27
+ <meta name="description" content="A blog about web scraping">
28
+ <meta property="og:title" content="Example Blog OG">
29
+ <meta property="og:image" content="/images/og.png">
30
+ <link rel="canonical" href="https://example.com/blog/">
31
+ </head>
32
+ <body>
33
+ <article class="post featured">
34
+ <h1> Getting Started with Web Scraping </h1>
35
+ <p>First paragraph of content.</p>
36
+ <a href="/post/getting-started">Read More</a>
37
+ <img src="/images/scraping.png" alt="Web Scraping">
38
+ <ul class="tags">
39
+ <li>scraping</li>
40
+ <li>html</li>
41
+ <li>tutorial</li>
42
+ </ul>
43
+ </article>
44
+
45
+ <article class="post">
46
+ <h1>Advanced CSS Selectors</h1>
47
+ <p>Learn about complex selectors.</p>
48
+ <a href="/post/advanced-selectors">Read More</a>
49
+ <ul class="tags">
50
+ <li>css</li>
51
+ <li>selectors</li>
52
+ </ul>
53
+ </article>
54
+
55
+ <section class="sidebar">
56
+ <h2>Popular Posts</h2>
57
+ <ul>
58
+ <li><a href="/popular/1">How to Use CSS Selectors</a></li>
59
+ <li><a href="/popular/2">HTML Parsing Guide</a></li>
60
+ </ul>
61
+ </section>
62
+
63
+ <table class="pricing">
64
+ <thead>
65
+ <tr><th>Plan</th><th>Price</th><th>Features</th></tr>
66
+ </thead>
67
+ <tbody>
68
+ <tr><td>Basic</td><td>$10/mo</td><td>100 requests</td></tr>
69
+ <tr><td>Pro</td><td>$30/mo</td><td>1000 requests</td></tr>
70
+ <tr><td>Enterprise</td><td>$100/mo</td><td>Unlimited</td></tr>
71
+ </tbody>
72
+ </table>
73
+
74
+ <form action="/search" method="get">
75
+ <input type="text" name="q" placeholder="Search...">
76
+ <button type="submit">Go</button>
77
+ </form>
78
+ </body>
79
+ </html>
80
+ `;
81
+
82
+ const doc = html.parse(HTML, { baseUrl: 'https://example.com/blog/' });
83
+
84
+ // ============================================================
85
+ // 1. Friendly Required/Optional Aliases
86
+ // ============================================================
87
+ console.log('=== 1. Friendly Aliases ===');
88
+
89
+ // firstOrThrow - like first_s but more readable
90
+ const firstArticle = doc.firstOrThrow('article');
91
+ console.log('firstOrThrow article text:', firstArticle.text('h1'));
92
+
93
+ // onlyOrThrow - returns single element or throws
94
+ const sidebar = doc.onlyOrThrow('.sidebar');
95
+ console.log('onlyOrThrow .sidebar heading:', sidebar.text('h2'));
96
+
97
+ // attrOrThrow - get attribute or throw
98
+ const firstLink = doc.firstOrThrow('a');
99
+ console.log('attrOrThrow href:', firstLink.attrOrThrow('href'));
100
+
101
+ // textOrThrow - find element and get text or throw
102
+ console.log('textOrThrow h1:', doc.textOrThrow('h1'));
103
+
104
+ // ============================================================
105
+ // 2. Selector-Scoped Convenience Methods
106
+ // ============================================================
107
+ console.log('\n=== 2. Convenience Methods ===');
108
+
109
+ // exists(selector) - check if any element matches
110
+ console.log('exists .featured:', doc.exists('.featured')); // true
111
+ console.log('exists .missing:', doc.exists('.missing')); // false
112
+
113
+ // count(selector) - count matching elements
114
+ console.log('count article:', doc.count('article')); // 2
115
+ console.log('count li:', doc.count('li')); // 7
116
+
117
+ // text(selector) - get text of first match (null if not found)
118
+ console.log('text h1:', JSON.stringify(doc.text('h1'))); // "Getting Started..."
119
+
120
+ // attr(selector, name) - get attribute of first match
121
+ console.log('attr meta[property] content:', doc.attr('meta[property="og:title"]', 'content'));
122
+
123
+ // attrOrThrow(selector, name) - get attribute or throw
124
+ console.log('attrOrThrow a href:', doc.attrOrThrow('a', 'href'));
125
+
126
+ // ============================================================
127
+ // 3. Text Normalization
128
+ // ============================================================
129
+ console.log('\n=== 3. Text Normalization ===');
130
+
131
+ const h1 = doc.firstOrThrow('h1');
132
+
133
+ // raw text (default)
134
+ console.log('raw text:', JSON.stringify(h1.text()));
135
+
136
+ // normalized (trimmed + collapsed whitespace)
137
+ console.log('normalized:', JSON.stringify(h1.text({ normalize: true })));
138
+
139
+ // separator (join descendant text with custom separator)
140
+ console.log('separator:', JSON.stringify(h1.text({ separator: ' | ' })));
141
+
142
+ // ============================================================
143
+ // 4. Traversal
144
+ // ============================================================
145
+ console.log('\n=== 4. Traversal ===');
146
+
147
+ // closest - find nearest ancestor matching selector
148
+ const firstH1 = doc.firstOrThrow('h1');
149
+ const article = firstH1.closest('article');
150
+ console.log('h1.closest(article) tag:', article.tagName);
151
+ console.log('h1.closest(article) class:', article.attr('class'));
152
+
153
+ // children - get direct element children (optionally filtered)
154
+ const allChildren = article.children();
155
+ console.log('article children count:', allChildren.length);
156
+
157
+ const headingChildren = article.children('h1');
158
+ console.log('article children(h1):', headingChildren.length);
159
+
160
+ // siblings - get sibling elements (optionally filtered)
161
+ const firstP = article.firstOrThrow('p');
162
+ const siblingElements = firstP.siblings();
163
+ console.log('p siblings count:', siblingElements.length);
164
+
165
+ const linkSiblings = firstP.siblings('a');
166
+ console.log('p siblings(a):', linkSiblings.length);
167
+
168
+ // matches / is - check if element matches a selector
169
+ console.log('h1 matches article h1:', firstH1.matches('article h1'));
170
+ console.log('h1 matches div:', firstH1.matches('div'));
171
+ console.log('h1 is(h1):', firstH1.is('h1'));
172
+
173
+ // ============================================================
174
+ // 5. Table Extraction
175
+ // ============================================================
176
+ console.log('\n=== 5. Table Extraction ===');
177
+
178
+ // rows() - extract table rows as objects
179
+ const pricingRows = doc.firstOrThrow('.pricing').rows();
180
+ console.log('pricing table rows:');
181
+ for (const row of pricingRows) {
182
+ console.log(` ${row.Plan}: ${row.Price} (${row.Features})`);
183
+ }
184
+
185
+ // doc.table() - convenience: find table and extract rows
186
+ const tableData = doc.table('.pricing');
187
+ console.log('doc.table direct:', JSON.stringify(tableData));
188
+
189
+ // ============================================================
190
+ // 6. Structured Extraction
191
+ // ============================================================
192
+ console.log('\n=== 6. Structured Extraction ===');
193
+
194
+ const extracted = doc.extract({
195
+ title: ['h1', 'text'],
196
+ canonicalUrl: ['link[rel="canonical"]', 'href'],
197
+ hasFeatured: ['.featured', 'exists'],
198
+
199
+ articles: ['article.post', {
200
+ heading: ['h1', 'text'],
201
+ link: ['a', 'href'],
202
+ hasImage: ['img', 'exists'],
203
+ tags: ['ul.tags li', 'text'],
204
+ }],
205
+ });
206
+
207
+ console.log(JSON.stringify(extracted, null, 2));
208
+
209
+ // Element-level extract
210
+ const articleEl = doc.firstOrThrow('article');
211
+ const articleData = articleEl.extract({
212
+ heading: ['h1', 'text'],
213
+ tagCount: ['li', 'count'],
214
+ });
215
+ console.log('elem.extract:', JSON.stringify(articleData));
216
+
217
+ // ============================================================
218
+ // 7. URL Resolution
219
+ // ============================================================
220
+ console.log('\n=== 7. URL Resolution ===');
221
+
222
+ // baseUrl resolves relative URLs in links() and images()
223
+ const links = doc.links();
224
+ console.log('links (with baseUrl):');
225
+ for (const l of links) {
226
+ console.log(` "${l.text}" -> ${l.href}`);
227
+ }
228
+
229
+ const imgs = doc.images();
230
+ console.log('images (with baseUrl):');
231
+ for (const img of imgs) {
232
+ console.log(` alt="${img.alt}" src="${img.src}"`);
233
+ }
234
+
235
+ // doc.url(selector, attr) - resolve a specific URL
236
+ console.log('url(a, href):', doc.url('a', 'href'));
237
+
238
+ // ============================================================
239
+ // 8. Common Extractors
240
+ // ============================================================
241
+ console.log('\n=== 8. Common Extractors ===');
242
+
243
+ // Page title
244
+ console.log('title():', doc.title());
245
+
246
+ // Meta description
247
+ console.log('description():', doc.description());
248
+
249
+ // Canonical URL
250
+ console.log('canonicalUrl():', doc.canonicalUrl());
251
+
252
+ // Meta tags object
253
+ console.log('meta():', JSON.stringify(doc.meta()));
254
+
255
+ // Forms
256
+ console.log('forms count:', doc.forms().length);
257
+
258
+ // Tables
259
+ console.log('tables count:', doc.tables().length);
260
+
261
+ // ============================================================
262
+ // 9. Worked Example: Real-World Scraping
263
+ // ============================================================
264
+ console.log('\n=== 9. Real-World Scraping Example ===');
265
+
266
+ /**
267
+ * Extract structured data from a blog page with one call.
268
+ */
269
+ function scrapeBlogPage(htmlContent, pageUrl) {
270
+ const doc = html.parse(htmlContent, { baseUrl: pageUrl });
271
+
272
+ return doc.extract({
273
+ // Page-level info
274
+ pageTitle: ['title', 'text'],
275
+ metaDescription: ['meta[name="description"]', 'content'],
276
+ ogImage: ['meta[property="og:image"]', 'content'],
277
+ canonicalUrl: ['link[rel="canonical"]', 'href'],
278
+
279
+ // Content
280
+ posts: ['article.post', {
281
+ heading: ['h1', 'text'],
282
+ link: ['a', 'href'],
283
+ hasImage: ['img', 'exists'],
284
+ isFeatured: ['.featured', 'exists'],
285
+ tagCount: ['ul.tags li', 'count'],
286
+ }],
287
+
288
+ // Sidebar links
289
+ sidebarLinks: ['.sidebar a', 'text'],
290
+
291
+ // Stats
292
+ totalArticles: ['article.post', 'count'],
293
+ hasForm: ['form', 'exists'],
294
+ hasPricing: ['.pricing', 'exists'],
295
+ });
296
+ }
297
+
298
+ const result = scrapeBlogPage(HTML, 'https://example.com/blog/');
299
+ console.log(JSON.stringify(result, null, 2));
300
+
301
+ console.log('\nAll examples completed successfully!');
package/index.d.ts CHANGED
@@ -1,11 +1,21 @@
1
1
  export declare type NodeType = 'DOCUMENT' | 'ELEMENT' | 'TEXT' | 'CDATA' | 'COMMENT' | 'WHITESPACE' | 'TEMPLATE' | 'UNKNOWN';
2
2
 
3
+ export declare type TextOptions = {
4
+ normalize?: boolean;
5
+ separator?: string;
6
+ };
7
+
8
+ export declare type ExtractSchema = {
9
+ [key: string]: [string, string | ExtractSchema | 'exists' | 'text' | 'count'];
10
+ };
11
+
3
12
  export declare type XElement = {
4
13
  childNodes: XElement[];
5
14
  nodeType: NodeType;
6
15
  parent: XElement | null;
7
16
  outerHTML: string;
8
17
  innerText: string;
18
+ textContent: string;
9
19
  tagName: string | null;
10
20
 
11
21
  attr: (name: string) => string | undefined;
@@ -13,24 +23,69 @@ export declare type XElement = {
13
23
  find: (selector: string) => XElement[];
14
24
  first: (selector: string) => XElement | null;
15
25
  first_s: (selector: string) => XElement;
26
+ firstOrThrow: (selector: string) => XElement;
16
27
  only: (selector: string) => XElement | null;
17
28
  only_s: (selector: string) => XElement;
29
+ onlyOrThrow: (selector: string) => XElement;
18
30
  hasClass: (name: string) => boolean;
19
31
  hasAttribute: (name: string) => boolean;
20
32
  prev: (selector?: string) => XElement | null;
21
33
  next: (selector?: string) => XElement | null;
34
+
35
+ // New methods
36
+ attrOrThrow: (name: string) => string;
37
+ text: ((opts?: TextOptions) => string) | ((selector: string, opts?: TextOptions) => string | null);
38
+ textOrThrow: (selector: string) => string;
39
+ exists: (selector: string) => boolean;
40
+ count: (selector: string) => number;
41
+ closest: (selector: string) => XElement | null;
42
+ children: (selector?: string) => XElement[];
43
+ siblings: (selector?: string) => XElement[];
44
+ matches: (selector: string) => boolean;
45
+ is: (selector: string) => boolean;
46
+ rows: () => Array<{ [header: string]: string }>;
47
+ urlAttr: (attrName: string) => string | undefined;
48
+ extract: (schema: ExtractSchema) => any;
22
49
  };
23
50
 
24
51
  export declare type XDocument = {
25
52
  documentElement: XElement;
26
- outerHTML: string;
27
53
  innerText: string;
54
+ textContent: string;
55
+ outerHTML: string;
28
56
  tagName: string | null;
29
- find: (selector: string) => XElement[]
57
+ nodeType: NodeType;
58
+
59
+ find: (selector: string) => XElement[];
30
60
  first: (selector: string) => XElement | null;
31
61
  first_s: (selector: string) => XElement;
62
+ firstOrThrow: (selector: string) => XElement;
32
63
  only: (selector: string) => XElement | null;
33
64
  only_s: (selector: string) => XElement;
65
+ onlyOrThrow: (selector: string) => XElement;
66
+
67
+ // New convenience methods
68
+ text: (selector: string, opts?: TextOptions) => string | null;
69
+ textOrThrow: (selector: string) => string;
70
+ attr: (selector: string, name: string) => string | undefined;
71
+ attrOrThrow: (selector: string, name: string) => string;
72
+ exists: (selector: string) => boolean;
73
+ count: (selector: string) => number;
74
+
75
+ // URL helpers
76
+ url: (selector: string, attr: string) => string | undefined;
77
+
78
+ // High-level extractors
79
+ extract: (schema: ExtractSchema) => any;
80
+ meta: () => { [key: string]: string };
81
+ links: () => Array<{ text: string; href: string }>;
82
+ images: () => Array<{ alt: string; src: string }>;
83
+ forms: () => XElement[];
84
+ tables: () => XElement[];
85
+ table: (selector?: string) => Array<{ [header: string]: string }>;
86
+ title: () => string | null;
87
+ description: () => string | undefined;
88
+ canonicalUrl: () => string | undefined;
34
89
  };
35
90
 
36
- export declare function parse(html: string): XDocument;
91
+ export declare function parse(html: string, options?: { baseUrl?: string }): XDocument;
package/index.js CHANGED
@@ -1,2 +1,7 @@
1
- const html = require('./html');
2
- module.exports = html;
1
+ 'use strict';
2
+
3
+ const bindings = require('bindings');
4
+ const html = bindings('html');
5
+ const { enhance } = require('./lib/wrapper');
6
+
7
+ module.exports = enhance(html);