@xcrap/extractor 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +131 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.js +29 -0
- package/dist/html/extractor-model.d.ts +34 -0
- package/dist/html/extractor-model.js +71 -0
- package/dist/html/extractors.d.ts +61 -0
- package/dist/html/extractors.js +145 -0
- package/dist/html/index.d.ts +3 -0
- package/dist/html/index.js +19 -0
- package/dist/html/parser.d.ts +41 -0
- package/dist/html/parser.js +72 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +23 -0
- package/dist/interfaces/extractor-model.d.ts +3 -0
- package/dist/interfaces/extractor-model.js +2 -0
- package/dist/json/extractor-model.d.ts +22 -0
- package/dist/json/extractor-model.js +97 -0
- package/dist/json/index.d.ts +2 -0
- package/dist/json/index.js +18 -0
- package/dist/json/parser.d.ts +6 -0
- package/dist/json/parser.js +48 -0
- package/dist/markdown/constants.d.ts +2 -0
- package/dist/markdown/constants.js +11 -0
- package/dist/markdown/extractor-model.d.ts +4 -0
- package/dist/markdown/extractor-model.js +17 -0
- package/dist/markdown/index.d.ts +2 -0
- package/dist/markdown/index.js +18 -0
- package/dist/markdown/parser.d.ts +5 -0
- package/dist/markdown/parser.js +18 -0
- package/dist/query-builders/html.d.ts +6 -0
- package/dist/query-builders/html.js +16 -0
- package/dist/query-builders/index.d.ts +1 -0
- package/dist/query-builders/index.js +17 -0
- package/dist/source-parser.d.ts +13 -0
- package/dist/source-parser.js +22 -0
- package/dist/utils/index.d.ts +3 -0
- package/dist/utils/index.js +19 -0
- package/dist/utils/select-first-element.d.ts +3 -0
- package/dist/utils/select-first-element.js +60 -0
- package/dist/utils/select-many-elements.d.ts +3 -0
- package/dist/utils/select-many-elements.js +57 -0
- package/dist/utils/slugify.d.ts +1 -0
- package/dist/utils/slugify.js +10 -0
- package/package.json +64 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Marcuth
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# 🕷️ Xcrap Extractor: Parsing HTML and JSON using declarative models
|
|
2
|
+
|
|
3
|
+
> Note: Xcrap Parser is now Xcrap Extractor
|
|
4
|
+
|
|
5
|
+
Xcrap Extractor is a package of the Xcrap framework. It was developed to handle the data-extraction part of working with text files (currently supporting only HTML, JSON and Markdown) using declarative models.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 📦 Installation
|
|
10
|
+
|
|
11
|
+
Installation is very simple: you can use NPM or any other package manager of your choice, such as PNPM or Yarn.
|
|
12
|
+
|
|
13
|
+
```cmd
|
|
14
|
+
npm i @xcrap/extractor
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🛠️ How to Use
|
|
20
|
+
|
|
21
|
+
Well, there are a few ways to use this parsing mechanism, from using the already created templates to expanding it by creating parsers for other file types and keeping the interleaving of these templates.
|
|
22
|
+
|
|
23
|
+
### Providing HTML string
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
import { HtmlParser, extract } from "@xcrap/extractor"
|
|
27
|
+
|
|
28
|
+
;(async () => {
|
|
29
|
+
const html = "<html><head><title>Page Title</title></head><body><><></body></html>"
|
|
30
|
+
const parser = new HtmlParser(html)
|
|
31
|
+
})();
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Loading an HTML file
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
import { HtmlParser, extract } from "@xcrap/extractor"
|
|
38
|
+
|
|
39
|
+
;(async () => {
|
|
40
|
+
const parser = await HtmlParser.loadFile("./path-to-html-file.html", { encoding: "utf-8" }) // Returns an instance of HtmlParser
|
|
41
|
+
})();
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Data extraction without using models
|
|
46
|
+
|
|
47
|
+
```ts
|
|
48
|
+
import { HtmlParser, extract, css } from "@xcrap/extractor"
|
|
49
|
+
|
|
50
|
+
;(async () => {
|
|
51
|
+
const html = `<html><head><title>Page Title</title></head><body><a href="https://example.com">Link</a></body></html>`
|
|
52
|
+
const parser = new HtmlParser(html)
|
|
53
|
+
|
|
54
|
+
// extractValue() searches and extracts something from the first element found
|
|
55
|
+
// extract(key: string, isAttribute?: boolean) is a generic extraction function, you can use some that are already created and ready to use by importing them from the same location :)
|
|
56
|
+
const title = await parser.extractValue({ query: css("title"), extractor: extract("innerText") })
|
|
57
|
+
|
|
58
|
+
// extractValues() fetches all the elements it finds with a query (you can limit the number of results) and uses the extractor to grab the data
|
|
59
|
+
const links = await parser.extractValues({ query: css("a"), extractor: extract("href", true) })
|
|
60
|
+
|
|
61
|
+
console.log(title) // "Page Title"
|
|
62
|
+
console.log(links) // ["https://example.com"]
|
|
63
|
+
})();
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Data extraction using models
|
|
67
|
+
|
|
68
|
+
ExtractorModels are decoupled enough that you don't have to rely on using SourceParser instances, but we'll still use them:
|
|
69
|
+
|
|
70
|
+
```ts
|
|
71
|
+
import { HtmlParser, HtmlExtrctorModel, extract, css } from "@xcrap/extractor"
|
|
72
|
+
|
|
73
|
+
;(async () => {
|
|
74
|
+
const html = `<html><body><h1>Heading</h1><div><p id="id">1</p><p id="name">Name</p><p class="age">23</p></div></body></html>`
|
|
75
|
+
const parser = new HtmlParser(html)
|
|
76
|
+
|
|
77
|
+
const rootModel = new HtmlExtrctorModel({
|
|
78
|
+
heading: {
|
|
79
|
+
query: css("h1"),
|
|
80
|
+
extractor: extract("innerText")
|
|
81
|
+
},
|
|
82
|
+
id: {
|
|
83
|
+
// You can also use xpath() query builder
|
|
84
|
+
// query: xpath("//*[@id='id']")
|
|
85
|
+
query: css("#id"),
|
|
86
|
+
extractor: extract("innerText")
|
|
87
|
+
},
|
|
88
|
+
name: {
|
|
89
|
+
query: css("#name"),
|
|
90
|
+
extractor: extract("innerText")
|
|
91
|
+
},
|
|
92
|
+
age: {
|
|
93
|
+
query: css(".age"),
|
|
94
|
+
extractor: extract("innerText")
|
|
95
|
+
}
|
|
96
|
+
})
|
|
97
|
+
|
|
98
|
+
const data = await parser.extractModel({ model: rootModel })
|
|
99
|
+
|
|
100
|
+
console.log(data) // { heading: "Heading", id: "1", name: "Name", age: "23" }
|
|
101
|
+
})();
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## 🧠 Create your own Parser: Concepts
|
|
105
|
+
|
|
106
|
+
### What is a SourceParser?
|
|
107
|
+
|
|
108
|
+
A SourceParser for this library is a class that deals in some way with a file type, loads that file, and may or may not have some methods to easily extract data.
|
|
109
|
+
|
|
110
|
+
A parser has a default method called `extractWithModel`, which is a wrapper that takes an `ExtractorModel` and calls its `extract()` method, providing the internal `source` property.
|
|
111
|
+
|
|
112
|
+
### What is an ExtractorModel?
|
|
113
|
+
|
|
114
|
+
An ExtractorModel is a class that receives a `shape` in its constructor and stores it as a property. It must have a method called `extract()` that will receive a `source`, which is the code/text that contains the information to be extracted.
|
|
115
|
+
|
|
116
|
+
This `shape` is used to declare the form in which the information will be extracted from the `source`.
|
|
117
|
+
|
|
118
|
+
## 🤝 Contributing
|
|
119
|
+
|
|
120
|
+
**We are actively looking for contributors!** Whether you are a beginner or an experienced developer, your help is welcome. We have many tasks available, from simple documentation fixes to complex parser implementations.
|
|
121
|
+
|
|
122
|
+
- Want to contribute? Follow these steps:
|
|
123
|
+
- Fork the repository.
|
|
124
|
+
- Create a new branch (git checkout -b feature-new).
|
|
125
|
+
- Commit your changes (git commit -m 'Add new feature').
|
|
126
|
+
- Push to the branch (git push origin feature-new).
|
|
127
|
+
- Open a Pull Request.
|
|
128
|
+
|
|
129
|
+
## 📝 License
|
|
130
|
+
|
|
131
|
+
This project is licensed under the MIT License.
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { BuildedQuery } from "./query-builders";
|
|
2
|
+
export declare class HTMLElementNotFoundError extends Error {
|
|
3
|
+
constructor(query?: BuildedQuery);
|
|
4
|
+
}
|
|
5
|
+
export declare class MultipleQueryError extends Error {
|
|
6
|
+
constructor();
|
|
7
|
+
}
|
|
8
|
+
export declare class FieldNotFoundError extends Error {
|
|
9
|
+
constructor(key: string);
|
|
10
|
+
}
|
|
11
|
+
export declare class ExtractorNotFoundError extends Error {
|
|
12
|
+
constructor(name: string);
|
|
13
|
+
}
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ExtractorNotFoundError = exports.FieldNotFoundError = exports.MultipleQueryError = exports.HTMLElementNotFoundError = void 0;
|
|
4
|
+
/**
 * Error reporting that no element matched a query.
 *
 * The message embeds the JSON serialization of the query, or the text
 * "no query provided" when the query serializes to nothing (e.g. `undefined`).
 */
class HTMLElementNotFoundError extends Error {
    /**
     * @param {*} [query] - Query that produced no element; serialized with JSON.stringify.
     */
    constructor(query) {
        const serializedQuery = JSON.stringify(query);
        const description = serializedQuery ? serializedQuery : "no query provided";
        super(`Element with query "${description}" not found`);
        this.name = "HTMLElementNotFoundError";
    }
}
|
|
10
|
+
exports.HTMLElementNotFoundError = HTMLElementNotFoundError;
|
|
11
|
+
/**
 * Error reporting that a value flagged as `multiple` was declared without a `query`.
 */
class MultipleQueryError extends Error {
    constructor() {
        const message = "Multiple value must have a 'query'";
        super(message);
        this.name = "MultipleQueryError";
    }
}
|
|
17
|
+
exports.MultipleQueryError = MultipleQueryError;
|
|
18
|
+
/**
 * Error whose message reports that a field with the given key was not found.
 */
class FieldNotFoundError extends Error {
    /**
     * @param {string} key - Key that could not be found; interpolated into the message.
     */
    constructor(key) {
        super(`Field with key "${key}" not found`);
        // Set the name explicitly (as HTMLElementNotFoundError and MultipleQueryError do)
        // so `err.name` and stack traces identify the concrete error type instead of "Error".
        this.name = "FieldNotFoundError";
    }
}
|
|
23
|
+
exports.FieldNotFoundError = FieldNotFoundError;
|
|
24
|
+
/**
 * Error whose message reports that an extractor with the given name was not found.
 */
class ExtractorNotFoundError extends Error {
    /**
     * @param {string} name - Extractor name that could not be resolved; interpolated into the message.
     */
    constructor(name) {
        super(`Extractor with name "${name}" not found`);
        // Set the name explicitly (as HTMLElementNotFoundError and MultipleQueryError do)
        // so `err.name` and stack traces identify the concrete error type instead of "Error".
        this.name = "ExtractorNotFoundError";
    }
}
|
|
29
|
+
exports.ExtractorNotFoundError = ExtractorNotFoundError;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { HTMLElement, Options as NodeHtmlOptions } from "node-html-parser";
|
|
2
|
+
import { ExtractorModel } from "../interfaces/extractor-model";
|
|
3
|
+
import { BuildedQuery } from "../query-builders";
|
|
4
|
+
import { ExtractorFunction } from "./extractors";
|
|
5
|
+
export type HtmlExtrctorModelShapeBaseValue = {
|
|
6
|
+
query?: BuildedQuery;
|
|
7
|
+
default?: string | string[] | null;
|
|
8
|
+
multiple?: boolean;
|
|
9
|
+
limit?: number;
|
|
10
|
+
extractor: ExtractorFunction;
|
|
11
|
+
};
|
|
12
|
+
export type HtmlExtrctorModelShapeNestedValue = {
|
|
13
|
+
query: BuildedQuery;
|
|
14
|
+
limit?: number;
|
|
15
|
+
multiple?: boolean;
|
|
16
|
+
model: ExtractorModel;
|
|
17
|
+
extractor?: ExtractorFunction;
|
|
18
|
+
};
|
|
19
|
+
export type HtmlExtrctorModelValue = HtmlExtrctorModelShapeBaseValue | HtmlExtrctorModelShapeNestedValue;
|
|
20
|
+
export type HtmlExtrctorModelShape = {
|
|
21
|
+
[key: string]: HtmlExtrctorModelValue;
|
|
22
|
+
};
|
|
23
|
+
export type InferHtmlValue<V extends HtmlExtrctorModelValue> = V extends HtmlExtrctorModelShapeNestedValue ? V["multiple"] extends true ? V["model"] extends ExtractorModel<infer M> ? M[] : any : V["model"] extends ExtractorModel<infer M> ? M : any : V extends HtmlExtrctorModelShapeBaseValue ? V["multiple"] extends true ? Awaited<ReturnType<V["extractor"]>>[] : Awaited<ReturnType<V["extractor"]>> : never;
|
|
24
|
+
export type InferHtmlShape<S extends HtmlExtrctorModelShape> = {
|
|
25
|
+
[K in keyof S]: InferHtmlValue<S[K]>;
|
|
26
|
+
};
|
|
27
|
+
export type ParseBaseValueReturnType = (undefined | string)[] | string | null | undefined;
|
|
28
|
+
export declare class HtmlExtrctorModel<S extends HtmlExtrctorModelShape> implements ExtractorModel<InferHtmlShape<S>> {
|
|
29
|
+
readonly shape: S;
|
|
30
|
+
constructor(shape: S);
|
|
31
|
+
extract(source: string, options?: NodeHtmlOptions): Promise<InferHtmlShape<S>>;
|
|
32
|
+
protected extractBaseValue(value: HtmlExtrctorModelShapeBaseValue, root: HTMLElement): Promise<ParseBaseValueReturnType>;
|
|
33
|
+
protected extractNestedValue(value: HtmlExtrctorModelShapeNestedValue, root: HTMLElement): Promise<any>;
|
|
34
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.HtmlExtrctorModel = void 0;
|
|
7
|
+
const node_html_parser_1 = __importDefault(require("node-html-parser"));
|
|
8
|
+
const errors_1 = require("../errors");
|
|
9
|
+
const utils_1 = require("../utils");
|
|
10
|
+
const parser_1 = require("./parser");
|
|
11
|
+
/**
 * Declarative extraction model for HTML.
 *
 * Holds a `shape` that maps output keys to value descriptors and resolves each
 * descriptor against a document parsed with node-html-parser.
 */
class HtmlExtrctorModel {
    /**
     * @param {object} shape - Map of output key -> value descriptor. A descriptor that
     *   contains a `model` property is treated as nested; any other descriptor is
     *   treated as a base value.
     */
    constructor(shape) {
        this.shape = shape;
    }
    /**
     * Parses `source` and resolves every key of the shape, one after another.
     *
     * @param {string} source - HTML text to parse.
     * @param {object} [options] - Options forwarded to the node-html-parser `parse` call;
     *   defaults to `nodeHtmlParserOptions` from the parser module.
     * @returns {Promise<object>} Object with the same keys as the shape.
     */
    async extract(source, options = parser_1.nodeHtmlParserOptions) {
        const root = node_html_parser_1.default.parse(source, options);
        const data = {};
        for (const key in this.shape) {
            const value = this.shape[key];
            if ("model" in value) {
                data[key] = await this.extractNestedValue(value, root);
            }
            else {
                data[key] = await this.extractBaseValue(value, root);
            }
        }
        return data;
    }
    /**
     * Resolves a base descriptor (`query`, `extractor`, optional `multiple`, `limit`, `default`).
     *
     * @param {object} value - Base value descriptor.
     * @param {object} root - Parsed root element the query is evaluated against.
     * @returns {Promise<*>} An array of extracted values when `multiple` is set; otherwise
     *   the single extracted value, or `value.default` when no element matched and a
     *   default was supplied.
     * @throws {MultipleQueryError} When `multiple` is set but no `query` is given.
     * @throws {HTMLElementNotFoundError} When no element matched and `default` is undefined.
     */
    async extractBaseValue(value, root) {
        if (value.multiple) {
            if (!value.query) {
                throw new errors_1.MultipleQueryError();
            }
            const matched = (0, utils_1.selectManyElements)(value.query, root);
            // slice keeps the first `limit` elements without mutating the helper's array.
            const elements = value.limit !== undefined ? matched.slice(0, value.limit) : matched;
            return await Promise.all(elements.map((element) => value.extractor(element)));
        }
        // Without a query the extractor is applied to the root itself.
        const element = value.query ? (0, utils_1.selectFirstElement)(value.query, root) : root;
        if (!element) {
            if (value.default === undefined) {
                throw new errors_1.HTMLElementNotFoundError(value.query);
            }
            return value.default;
        }
        return await value.extractor(element);
    }
    /**
     * Resolves a nested descriptor by handing each matched element to `value.model.extract`.
     *
     * The text passed to the nested model is the result of `value.extractor(element)` when
     * an extractor is supplied, otherwise the element's `outerHTML`. This applies to both
     * the single and the `multiple` branch.
     *
     * @param {object} value - Nested value descriptor (`query`, `model`, optional
     *   `multiple`, `limit`, `extractor`).
     * @param {object} root - Parsed root element the query is evaluated against.
     * @returns {Promise<*>} The nested model's result, or an array of results when
     *   `multiple` is set.
     * @throws {HTMLElementNotFoundError} In single mode, when no element matched.
     */
    async extractNestedValue(value, root) {
        const toSource = async (element) => (value.extractor ? await value.extractor(element) : element.outerHTML);
        if (value.multiple) {
            const matched = (0, utils_1.selectManyElements)(value.query, root);
            const elements = value.limit !== undefined ? matched.slice(0, value.limit) : matched;
            return await Promise.all(elements.map(async (element) => value.model.extract(await toSource(element))));
        }
        const element = (0, utils_1.selectFirstElement)(value.query, root);
        if (!element) {
            throw new errors_1.HTMLElementNotFoundError(value.query);
        }
        return await value.model.extract(await toSource(element));
    }
}
|
|
71
|
+
exports.HtmlExtrctorModel = HtmlExtrctorModel;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { Attributes, RawAttributes } from "node-html-parser/dist/nodes/html";
|
|
2
|
+
import { HTMLElement, NodeType } from "node-html-parser";
|
|
3
|
+
export type ExtractorFunctionReturnType = (string | undefined) | Promise<string | undefined>;
|
|
4
|
+
export type ExtractorFunction<T = ExtractorFunctionReturnType> = (element: HTMLElement) => T;
|
|
5
|
+
export type HtmlProperty = "innerText" | "textContent" | "text" | "innerHTML" | "outerHTML" | "tagName" | "classList" | "classNames" | "id" | "childElementCount" | "structure" | "structuredText" | "attributes" | "attrs" | "localName" | "nodeType" | "range" | "rawAttributes" | "rawAttrs" | "rawTagName" | "rawText";
|
|
6
|
+
export type HtmlAttribute = "href" | "src" | "value" | "style" | "role" | "alt" | "title" | "placeholder" | "disabled" | "readonly" | "checked" | "selected" | "name" | "type" | "autocomplete" | "maxlength" | "minlength" | "pattern" | "required" | "aria-label" | "aria-hidden" | "aria-expanded" | "aria-checked" | "aria-disabled" | "data-*" | `data-${string}` | (string & {});
|
|
7
|
+
export declare const propertyExtractors: Record<HtmlProperty, (element: HTMLElement) => unknown | undefined>;
|
|
8
|
+
export declare function extract<T extends HtmlProperty | HtmlAttribute, R = string>(key: T, isAttribute?: boolean): ExtractorFunction<R | undefined>;
|
|
9
|
+
export declare const extractInnerText: ExtractorFunction<string | undefined>;
|
|
10
|
+
export declare const extractTextContent: ExtractorFunction<string | undefined>;
|
|
11
|
+
export declare const extractText: ExtractorFunction<string | undefined>;
|
|
12
|
+
export declare const extractInnerHtml: ExtractorFunction<string | undefined>;
|
|
13
|
+
export declare const extractOuterHtml: ExtractorFunction<string | undefined>;
|
|
14
|
+
export declare const extractTagName: ExtractorFunction<string | undefined>;
|
|
15
|
+
export declare const extractClassList: ExtractorFunction<string[] | undefined>;
|
|
16
|
+
export declare const extractId: ExtractorFunction<string | undefined>;
|
|
17
|
+
export declare const extractHref: ExtractorFunction<string | undefined>;
|
|
18
|
+
export declare const extractSrc: ExtractorFunction<string | undefined>;
|
|
19
|
+
export declare const extractValue: ExtractorFunction<string | undefined>;
|
|
20
|
+
export declare const extractStyle: ExtractorFunction<string | undefined>;
|
|
21
|
+
export declare const extractRole: ExtractorFunction<string | undefined>;
|
|
22
|
+
export declare const extractTitle: ExtractorFunction<string | undefined>;
|
|
23
|
+
export declare const extractPlaceholder: ExtractorFunction<string | undefined>;
|
|
24
|
+
export declare const extractDisabled: ExtractorFunction<string | undefined>;
|
|
25
|
+
export declare const extractReadonly: ExtractorFunction<string | undefined>;
|
|
26
|
+
export declare const extractChecked: ExtractorFunction<string | undefined>;
|
|
27
|
+
export declare const extractSelected: ExtractorFunction<string | undefined>;
|
|
28
|
+
export declare const extractName: ExtractorFunction<string | undefined>;
|
|
29
|
+
export declare const extractType: ExtractorFunction<string | undefined>;
|
|
30
|
+
export declare const extractAutocomplete: ExtractorFunction<string | undefined>;
|
|
31
|
+
export declare const extractMaxLength: ExtractorFunction<string | undefined>;
|
|
32
|
+
export declare const extractMinLength: ExtractorFunction<string | undefined>;
|
|
33
|
+
export declare const extractPattern: ExtractorFunction<string | undefined>;
|
|
34
|
+
export declare const extractRequired: ExtractorFunction<string | undefined>;
|
|
35
|
+
export declare const extractAriaLabel: ExtractorFunction<string | undefined>;
|
|
36
|
+
export declare const extractAriaHidden: ExtractorFunction<string | undefined>;
|
|
37
|
+
export declare const extractAriaExpanded: ExtractorFunction<string | undefined>;
|
|
38
|
+
export declare const extractAriaChecked: ExtractorFunction<string | undefined>;
|
|
39
|
+
export declare const extractAriaDisabled: ExtractorFunction<string | undefined>;
|
|
40
|
+
export declare const extractAllData: ExtractorFunction<string | undefined>;
|
|
41
|
+
export declare const extractAttribute: <T extends string>(name: T) => ExtractorFunction<string | undefined>;
|
|
42
|
+
export declare const extractRange: ExtractorFunction<[number, number] | undefined>;
|
|
43
|
+
export declare const extarctAttributes: ExtractorFunction<Record<string, string> | undefined>;
|
|
44
|
+
export declare const extractAttrs: ExtractorFunction<Attributes | undefined>;
|
|
45
|
+
export declare const extractChildElementCount: ExtractorFunction<number | undefined>;
|
|
46
|
+
export declare const extractClassNames: ExtractorFunction<string | undefined>;
|
|
47
|
+
export declare const extractLocalName: ExtractorFunction<string | undefined>;
|
|
48
|
+
export declare const extractNodeType: ExtractorFunction<NodeType | undefined>;
|
|
49
|
+
export declare const extractRawTagName: ExtractorFunction<string | undefined>;
|
|
50
|
+
export declare const extractRawText: ExtractorFunction<string | undefined>;
|
|
51
|
+
export declare const extractStructure: ExtractorFunction<string | undefined>;
|
|
52
|
+
export declare const extractStructuredText: ExtractorFunction<string | undefined>;
|
|
53
|
+
export declare const extarctRawAttributes: ExtractorFunction<RawAttributes | undefined>;
|
|
54
|
+
export declare const extractRawAttrs: ExtractorFunction<string | undefined>;
|
|
55
|
+
export type FromNextOrPreviousElementSiblingOptions = {
|
|
56
|
+
shouldExists?: boolean;
|
|
57
|
+
};
|
|
58
|
+
export declare const fromNextElementSibling: (extractor: ExtractorFunction, { shouldExists }?: FromNextOrPreviousElementSiblingOptions) => ExtractorFunction;
|
|
59
|
+
export declare const fromPreviousElementSibling: (extractor: ExtractorFunction, { shouldExists }?: FromNextOrPreviousElementSiblingOptions) => ExtractorFunction;
|
|
60
|
+
export type RegexExtractor = (pattern: RegExp, index?: number) => ExtractorFunction<string | string[]>;
|
|
61
|
+
export declare const matchRegexFrom: (extractor: ExtractorFunction<string | undefined>, pattern: RegExp, index?: number) => ExtractorFunction<string | string[] | undefined>;
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.matchRegexFrom = exports.fromPreviousElementSibling = exports.fromNextElementSibling = exports.extractRawAttrs = exports.extarctRawAttributes = exports.extractStructuredText = exports.extractStructure = exports.extractRawText = exports.extractRawTagName = exports.extractNodeType = exports.extractLocalName = exports.extractClassNames = exports.extractChildElementCount = exports.extractAttrs = exports.extarctAttributes = exports.extractRange = exports.extractAttribute = exports.extractAllData = exports.extractAriaDisabled = exports.extractAriaChecked = exports.extractAriaExpanded = exports.extractAriaHidden = exports.extractAriaLabel = exports.extractRequired = exports.extractPattern = exports.extractMinLength = exports.extractMaxLength = exports.extractAutocomplete = exports.extractType = exports.extractName = exports.extractSelected = exports.extractChecked = exports.extractReadonly = exports.extractDisabled = exports.extractPlaceholder = exports.extractTitle = exports.extractRole = exports.extractStyle = exports.extractValue = exports.extractSrc = exports.extractHref = exports.extractId = exports.extractClassList = exports.extractTagName = exports.extractOuterHtml = exports.extractInnerHtml = exports.extractText = exports.extractTextContent = exports.extractInnerText = exports.propertyExtractors = void 0;
|
|
4
|
+
exports.extract = extract;
|
|
5
|
+
const errors_1 = require("../errors");
|
|
6
|
+
const htmlProperties = [
|
|
7
|
+
"innerText",
|
|
8
|
+
"textContent",
|
|
9
|
+
"text",
|
|
10
|
+
"innerHTML",
|
|
11
|
+
"outerHTML",
|
|
12
|
+
"tagName",
|
|
13
|
+
"classList",
|
|
14
|
+
"classNames",
|
|
15
|
+
"id",
|
|
16
|
+
"childElementCount",
|
|
17
|
+
"structure",
|
|
18
|
+
"structuredText",
|
|
19
|
+
"attributes",
|
|
20
|
+
"attrs",
|
|
21
|
+
"localName",
|
|
22
|
+
"nodeType",
|
|
23
|
+
"range",
|
|
24
|
+
"rawAttributes",
|
|
25
|
+
"rawAttrs",
|
|
26
|
+
"rawTagName",
|
|
27
|
+
"rawText",
|
|
28
|
+
];
|
|
29
|
+
exports.propertyExtractors = {
|
|
30
|
+
innerText: (element) => element.innerText,
|
|
31
|
+
textContent: (element) => element.textContent,
|
|
32
|
+
text: (element) => element.text,
|
|
33
|
+
innerHTML: (element) => element.innerHTML,
|
|
34
|
+
outerHTML: (element) => element.outerHTML,
|
|
35
|
+
tagName: (element) => element.tagName,
|
|
36
|
+
classList: (element) => Array.from(element.classList.values()),
|
|
37
|
+
id: (element) => element.id,
|
|
38
|
+
attributes: (element) => element.attributes,
|
|
39
|
+
attrs: (element) => element.attrs,
|
|
40
|
+
childElementCount: (element) => element.childElementCount,
|
|
41
|
+
classNames: (element) => element.classNames,
|
|
42
|
+
localName: (element) => element.localName,
|
|
43
|
+
nodeType: (element) => element.localName,
|
|
44
|
+
range: (element) => element.range,
|
|
45
|
+
rawAttributes: (element) => element.rawAttributes,
|
|
46
|
+
rawAttrs: (element) => element.rawAttrs,
|
|
47
|
+
rawTagName: (element) => element.rawTagName,
|
|
48
|
+
rawText: (element) => element.rawText,
|
|
49
|
+
structure: (element) => element.structure,
|
|
50
|
+
structuredText: (element) => element.structuredText,
|
|
51
|
+
};
|
|
52
|
+
/**
 * Builds an extractor function for the given key.
 *
 * The returned function reads the key as an attribute (via `element.getAttribute`)
 * when `isAttribute` is true or when the key is not listed in `htmlProperties`.
 * Otherwise it delegates to the matching entry of `propertyExtractors`.
 *
 * @param {string} key - Property or attribute name.
 * @param {boolean} [isAttribute=false] - Force attribute lookup even for a known property name.
 * @returns {function(object): *} Extractor taking an element.
 * @throws {ExtractorNotFoundError} From the returned function, when the key is a listed
 *   property but has no entry in `propertyExtractors`.
 */
function extract(key, isAttribute = false) {
    return (element) => {
        const readAsProperty = !isAttribute && htmlProperties.includes(key);
        if (!readAsProperty) {
            return element.getAttribute(key);
        }
        const readProperty = exports.propertyExtractors[key];
        if (readProperty) {
            return readProperty(element);
        }
        throw new errors_1.ExtractorNotFoundError(key);
    };
}
|
|
64
|
+
exports.extractInnerText = extract("innerText");
|
|
65
|
+
exports.extractTextContent = extract("textContent");
|
|
66
|
+
exports.extractText = extract("text");
|
|
67
|
+
exports.extractInnerHtml = extract("innerHTML");
|
|
68
|
+
exports.extractOuterHtml = extract("outerHTML");
|
|
69
|
+
exports.extractTagName = extract("tagName");
|
|
70
|
+
exports.extractClassList = extract("classList");
|
|
71
|
+
exports.extractId = extract("id");
|
|
72
|
+
exports.extractHref = extract("href", true);
|
|
73
|
+
exports.extractSrc = extract("src", true);
|
|
74
|
+
exports.extractValue = extract("value", true);
|
|
75
|
+
exports.extractStyle = extract("style", true);
|
|
76
|
+
exports.extractRole = extract("role", true);
|
|
77
|
+
exports.extractTitle = extract("title", true);
|
|
78
|
+
exports.extractPlaceholder = extract("placeholder", true);
|
|
79
|
+
exports.extractDisabled = extract("disabled", true);
|
|
80
|
+
exports.extractReadonly = extract("readonly", true);
|
|
81
|
+
exports.extractChecked = extract("checked", true);
|
|
82
|
+
exports.extractSelected = extract("selected", true);
|
|
83
|
+
exports.extractName = extract("name", true);
|
|
84
|
+
exports.extractType = extract("type", true);
|
|
85
|
+
exports.extractAutocomplete = extract("autocomplete", true);
|
|
86
|
+
exports.extractMaxLength = extract("maxlength", true);
|
|
87
|
+
exports.extractMinLength = extract("minlength", true);
|
|
88
|
+
exports.extractPattern = extract("pattern", true);
|
|
89
|
+
exports.extractRequired = extract("required", true);
|
|
90
|
+
exports.extractAriaLabel = extract("aria-label", true);
|
|
91
|
+
exports.extractAriaHidden = extract("aria-hidden", true);
|
|
92
|
+
exports.extractAriaExpanded = extract("aria-expanded", true);
|
|
93
|
+
exports.extractAriaChecked = extract("aria-checked", true);
|
|
94
|
+
exports.extractAriaDisabled = extract("aria-disabled", true);
|
|
95
|
+
exports.extractAllData = extract("data-*", true);
|
|
96
|
+
const extractAttribute = (name) => extract(name, true);
|
|
97
|
+
exports.extractAttribute = extractAttribute;
|
|
98
|
+
exports.extractRange = extract("range");
|
|
99
|
+
exports.extarctAttributes = extract("attributes");
|
|
100
|
+
exports.extractAttrs = extract("attrs");
|
|
101
|
+
exports.extractChildElementCount = extract("childElementCount");
|
|
102
|
+
exports.extractClassNames = extract("classNames");
|
|
103
|
+
exports.extractLocalName = extract("localName");
|
|
104
|
+
exports.extractNodeType = extract("nodeType");
|
|
105
|
+
exports.extractRawTagName = extract("rawTagName");
|
|
106
|
+
exports.extractRawText = extract("rawText");
|
|
107
|
+
exports.extractStructure = extract("structure");
|
|
108
|
+
exports.extractStructuredText = extract("structuredText");
|
|
109
|
+
exports.extarctRawAttributes = extract("rawAttributes");
|
|
110
|
+
exports.extractRawAttrs = extract("rawAttrs");
|
|
111
|
+
/**
 * Wraps an extractor so that it runs on the element's `nextElementSibling`
 * instead of the element itself.
 *
 * @param {function(object): *} extractor - Extractor applied to the sibling.
 * @param {object} [options]
 * @param {boolean} [options.shouldExists=true] - When true, a missing sibling throws;
 *   when false, the wrapper returns `undefined` instead.
 * @returns {function(object): *} Extractor taking the original element.
 * @throws {HTMLElementNotFoundError} From the returned function, when the sibling is
 *   missing and `shouldExists` is true.
 */
// The default lives on the property so that passing `{}` still yields
// `shouldExists === true`, not `undefined`.
const fromNextElementSibling = (extractor, { shouldExists = true } = {}) => {
    return (element) => {
        const sibling = element.nextElementSibling;
        if (sibling) {
            return extractor(sibling);
        }
        if (shouldExists) {
            throw new errors_1.HTMLElementNotFoundError();
        }
        return undefined;
    };
};
|
|
123
|
+
exports.fromNextElementSibling = fromNextElementSibling;
|
|
124
|
+
/**
 * Wraps an extractor so that it runs on the element's `previousElementSibling`
 * instead of the element itself.
 *
 * @param {function(object): *} extractor - Extractor applied to the sibling.
 * @param {object} [options]
 * @param {boolean} [options.shouldExists=true] - When true, a missing sibling throws;
 *   when false, the wrapper returns `undefined` instead.
 * @returns {function(object): *} Extractor taking the original element.
 * @throws {HTMLElementNotFoundError} From the returned function, when the sibling is
 *   missing and `shouldExists` is true.
 */
// The default lives on the property so that passing `{}` still yields
// `shouldExists === true`, not `undefined`.
const fromPreviousElementSibling = (extractor, { shouldExists = true } = {}) => {
    return (element) => {
        const sibling = element.previousElementSibling;
        if (sibling) {
            return extractor(sibling);
        }
        if (shouldExists) {
            throw new errors_1.HTMLElementNotFoundError();
        }
        return undefined;
    };
};
|
|
136
|
+
exports.fromPreviousElementSibling = fromPreviousElementSibling;
|
|
137
|
+
/**
 * Wraps an extractor and runs a regular expression over the text it returns.
 *
 * For every match the first capture group is used when it participated in the
 * match, otherwise the whole match. A nullish extractor result is treated as
 * the empty string.
 *
 * @param {function(object): (string|undefined)} extractor - Produces the text to search.
 * @param {RegExp} pattern - Pattern to match. A RegExp without the `g` flag is copied
 *   with `g` added, because `String.prototype.matchAll` throws a TypeError on
 *   non-global regular expressions.
 * @param {number} [index] - When given, return only the match at this position
 *   (`undefined` if there is none); otherwise return the array of all matches.
 * @returns {function(object): (string|string[]|undefined)} Extractor taking an element.
 */
const matchRegexFrom = (extractor, pattern, index) => {
    // Only RegExp instances lacking `g` are rewritten; anything else is passed through untouched.
    const globalPattern = pattern instanceof RegExp && !pattern.global
        ? new RegExp(pattern.source, `${pattern.flags}g`)
        : pattern;
    return (element) => {
        const extracted = extractor(element);
        const value = extracted !== null && extracted !== undefined ? extracted : "";
        const matches = Array.from(value.matchAll(globalPattern), (match) => {
            const group = match[1];
            return group !== null && group !== undefined ? group : match[0];
        });
        return index !== undefined ? matches[index] : matches;
    };
};
|
|
145
|
+
exports.matchRegexFrom = matchRegexFrom;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
// Re-exports every enumerable property of `m` onto `exports`, skipping
// "default" and any name `exports` already owns.
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    var owns = Object.prototype.hasOwnProperty;
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (owns.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
|
|
16
|
+
// Flag the exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
// Re-export the members of ./extractor-model, ./parser and ./extractors
// through this index module (names already present on `exports` are skipped
// by __exportStar).
__exportStar(require("./extractor-model"), exports);
|
|
18
|
+
__exportStar(require("./parser"), exports);
|
|
19
|
+
__exportStar(require("./extractors"), exports);
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { HTMLElement, Options as NodeHtmlOptions } from "node-html-parser";
|
|
2
|
+
import { ExtractorModel } from "../interfaces/extractor-model";
|
|
3
|
+
import { BuildedQuery } from "../query-builders";
|
|
4
|
+
import { ExtractorFunction } from "./extractors";
|
|
5
|
+
import { SourceParser } from "../source-parser";
|
|
6
|
+
/**
 * Options for `HtmlParser.extractValues`.
 *
 * - `query`: selects the elements to extract from.
 * - `extractor`: applied to each selected element.
 * - `limit`: optional cap on how many values are collected.
 */
export type ExtractValuesOptions = {
|
|
7
|
+
query: BuildedQuery;
|
|
8
|
+
extractor: ExtractorFunction;
|
|
9
|
+
limit?: number;
|
|
10
|
+
};
|
|
11
|
+
/**
 * Options for `HtmlParser.extractValue`.
 *
 * - `query`: optional; when omitted the extractor runs on the document root.
 * - `extractor`: applied to the first element matching `query` (or the root).
 * - `default`: returned when no element matches `query`, or when the
 *   extractor yields `null`/`undefined`.
 */
export type ExtractValueOptions = {
|
|
12
|
+
query?: BuildedQuery;
|
|
13
|
+
extractor: ExtractorFunction;
|
|
14
|
+
default?: string | null;
|
|
15
|
+
};
|
|
16
|
+
/**
 * Options for `HtmlParser.extractModel`.
 *
 * - `query`: optional; when omitted the model is applied to the document root.
 * - `model`: extractor model whose result type is `T`.
 */
export type ExtractModelOptions<T = any> = {
|
|
17
|
+
query?: BuildedQuery;
|
|
18
|
+
model: ExtractorModel<T>;
|
|
19
|
+
};
|
|
20
|
+
/**
 * Options for `HtmlParser.extractModels`.
 *
 * - `query`: selects the elements the model is applied to.
 * - `model`: extractor model whose result type is `T`.
 * - `limit`: optional cap on how many results are collected.
 */
export type ExtractModelsOptions<T = any> = {
|
|
21
|
+
query: BuildedQuery;
|
|
22
|
+
model: ExtractorModel<T>;
|
|
23
|
+
limit?: number;
|
|
24
|
+
};
|
|
25
|
+
/**
 * Default node-html-parser options used by `HtmlParser` when the caller
 * supplies none. Enables `blockTextElements` for `script`, `noscript`,
 * `style` and `code`.
 * NOTE(review): the exact effect of `blockTextElements` is defined by
 * node-html-parser — confirm against its documentation.
 */
export declare const nodeHtmlParserOptions: {
|
|
26
|
+
blockTextElements: {
|
|
27
|
+
script: true;
|
|
28
|
+
noscript: true;
|
|
29
|
+
style: true;
|
|
30
|
+
code: true;
|
|
31
|
+
};
|
|
32
|
+
};
|
|
33
|
+
/**
 * Source parser for HTML documents.
 *
 * - `source`: the raw HTML string given to the constructor.
 * - `root`: the parsed document root element.
 * - `extractValues`: runs an extractor over every element matching a query,
 *   optionally stopping after `limit` values.
 * - `extractValue`: runs an extractor on the first match (or the root when no
 *   query is given), falling back to `default`.
 * - `extractModel`: applies a model to the first match (or the root).
 * - `extractModels`: applies a model to every match, optionally up to `limit`.
 */
export declare class HtmlParser extends SourceParser {
|
|
34
|
+
readonly source: string;
|
|
35
|
+
readonly root: HTMLElement;
|
|
36
|
+
constructor(source: string, options?: NodeHtmlOptions);
|
|
37
|
+
extractValues({ query, extractor, limit }: ExtractValuesOptions): Promise<(string | undefined)[]>;
|
|
38
|
+
extractValue({ query, extractor, default: default_ }: ExtractValueOptions): Promise<any | undefined | null>;
|
|
39
|
+
extractModel<T>({ model, query }: ExtractModelOptions<T>): Promise<T>;
|
|
40
|
+
extractModels<T>({ model, query, limit }: ExtractModelsOptions<T>): Promise<T[]>;
|
|
41
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Interop helper: returns modules flagged `__esModule` untouched and wraps
// anything else as `{ default: mod }`. An already-installed helper on `this`
// takes precedence.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
5
|
+
// Flag the exports object with `__esModule: true` (the flag __importDefault
// checks before wrapping a module).
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
// Pre-declare both exported names as undefined; they are assigned later in
// this module.
exports.HtmlParser = exports.nodeHtmlParserOptions = void 0;
|
|
7
|
+
const node_html_parser_1 = __importDefault(require("node-html-parser"));
|
|
8
|
+
const utils_1 = require("../utils");
|
|
9
|
+
const errors_1 = require("../errors");
|
|
10
|
+
const source_parser_1 = require("../source-parser");
|
|
11
|
+
// Default options handed to node-html-parser's parse() when the HtmlParser
// constructor receives no `options` argument.
// NOTE(review): blockTextElements semantics come from node-html-parser —
// confirm against its documentation.
exports.nodeHtmlParserOptions = {
|
|
12
|
+
blockTextElements: {
|
|
13
|
+
script: true,
|
|
14
|
+
noscript: true,
|
|
15
|
+
style: true,
|
|
16
|
+
code: true,
|
|
17
|
+
},
|
|
18
|
+
};
|
|
19
|
+
/**
 * Source parser for HTML: parses the source once with node-html-parser and
 * exposes value/model extraction helpers over the resulting tree.
 */
class HtmlParser extends source_parser_1.SourceParser {
    /**
     * @param {string} source - Raw HTML text.
     * @param {object} [options] - node-html-parser options; defaults to the
     *   exported `nodeHtmlParserOptions`.
     */
    constructor(source, options = exports.nodeHtmlParserOptions) {
        super(source);
        this.source = source;
        this.root = node_html_parser_1.default.parse(source, options);
    }

    /**
     * Runs `extractor` on each element matched by `query`, in order, awaiting
     * each result before moving to the next element.
     *
     * @returns {Promise<Array>} Extracted values, at most `limit` when given.
     */
    async extractValues({ query, extractor, limit }) {
        const matched = (0, utils_1.selectManyElements)(query, this.root);
        const capped = limit != undefined;
        const values = [];
        for (const node of matched) {
            if (capped && values.length >= limit) {
                break;
            }
            values.push(await extractor(node));
        }
        return values;
    }

    /**
     * Runs `extractor` on the first element matched by `query`, or on the
     * root when no query is given.
     *
     * @returns {Promise<*>} The extracted value, or `default` when the
     *   element is missing or the extractor yields `null`/`undefined`.
     * @throws {HTMLElementNotFoundError} When `query` matches nothing and no
     *   `default` was supplied.
     */
    async extractValue({ query, extractor, default: default_ }) {
        let target = this.root;
        if (query) {
            target = (0, utils_1.selectFirstElement)(query, this.root);
            if (!target) {
                if (default_ === undefined) {
                    throw new errors_1.HTMLElementNotFoundError(query);
                }
                return default_;
            }
        }
        const extracted = await extractor(target);
        if (extracted === null || extracted === undefined) {
            return default_;
        }
        return extracted;
    }

    /**
     * Applies `model` to the outer HTML of the first element matched by
     * `query`, or of the root when no query is given.
     *
     * @throws {HTMLElementNotFoundError} When no element is available.
     */
    async extractModel({ model, query }) {
        let scope = this.root;
        if (query) {
            scope = (0, utils_1.selectFirstElement)(query, this.root);
        }
        if (!scope) {
            throw new errors_1.HTMLElementNotFoundError(query);
        }
        const extracted = await model.extract(scope.outerHTML);
        return extracted;
    }

    /**
     * Applies `model` to the outer HTML of each element matched by `query`,
     * in order, awaiting each result before moving on.
     *
     * @returns {Promise<Array>} Extracted models, at most `limit` when given.
     */
    async extractModels({ model, query, limit }) {
        const matched = (0, utils_1.selectManyElements)(query, this.root);
        const capped = limit != undefined;
        const results = [];
        for (const node of matched) {
            if (capped && results.length >= limit) {
                break;
            }
            results.push(await model.extract(node.outerHTML));
        }
        return results;
    }
}
|
|
72
|
+
exports.HtmlParser = HtmlParser;
|