@xcrap/extractor 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # 🕷️ Xcrap Extractor: Parsing HTML and JSON using declarative models
1
+ # 🕷️ Xcrap Extractor: Parsing HTML, JSON and Markdown using declarative models
2
2
 
3
3
  > Note: Xcrap Parser is now Xcrap Extractor
4
4
 
@@ -65,16 +65,16 @@ import { HtmlParser, extract, css } from "@xcrap/extractor"
65
65
 
66
66
  ### Data extraction using models
67
67
 
68
- ExtractorModels are decoupled enough that you don't have to rely on using SourceParser instances, but we'll still use them:
68
+ ExtractionModels are decoupled enough that you don't have to rely on using SourceParser instances, but we'll still use them:
69
69
 
70
70
  ```ts
71
- import { HtmlParser, HtmlExtrctorModel, extract, css } from "@xcrap/extractor"
71
+ import { HtmlParser, HtmlExtractionModel, extract, css } from "@xcrap/extractor"
72
72
 
73
73
  ;(async () => {
74
74
  const html = `<html><body><h1>Heading</h1><div><p id="id">1</p><p id="name">Name</p><p class="age">23</p></div></body></html>`
75
75
  const parser = new HtmlParser(html)
76
76
 
77
- const rootModel = new HtmlExtrctorModel({
77
+ const rootModel = new HtmlExtractionModel({
78
78
  heading: {
79
79
  query: css("h1"),
80
80
  extractor: extract("innerText")
@@ -107,9 +107,9 @@ import { HtmlParser, HtmlExtrctorModel, extract, css } from "@xcrap/extractor"
107
107
 
108
108
  A SourceParser for this library is a class that deals in some way with a file type, loads that file, and may or may not have some methods to easily extract data.
109
109
 
110
- A parser has a default method called `extractWithModel` which is a wrapper that takes a `ExtractorModel` and calls the `extract()` method providing the internal `source` property.
110
+ A parser has a default method called `extractWithModel`, a wrapper that takes an `ExtractionModel` and calls its `extract()` method, passing the parser's internal `source` property.
111
111
 
112
- ### What is a ExtractorModel?
112
+ ### What is an ExtractionModel?
113
113
 
114
114
  An ExtractionModel is a class that receives a `shape` in its constructor and stores it as a property. It must have a method called `extract()` that will receive a `source`, which is the code/text that contains the information to be extracted.
115
115
 
@@ -0,0 +1,35 @@
1
+ import { HTMLElement, Options as NodeHtmlOptions } from "node-html-parser";
2
+ import { ExtractionModel } from "../interfaces/extraction-model";
3
+ import { BuildedQuery } from "../query-builders";
4
+ import { ExtractorFunction } from "./extractors";
5
/**
 * Shape entry whose value is produced by running `extractor` on the matched
 * element(s). `extractor` is the only required field.
 */
+ export type HtmlExtractionModelShapeBaseValue = {
6
+ query?: BuildedQuery;
7
+ default?: string | string[] | null;
8
+ multiple?: boolean;
9
+ limit?: number;
10
+ extractor: ExtractorFunction;
11
+ };
12
/**
 * Shape entry that hands the matched element(s) to another `ExtractionModel`.
 * `query` and `model` are required; `extractor` is optional.
 */
+ export type HtmlExtractionModelShapeNestedValue = {
13
+ query: BuildedQuery;
14
+ limit?: number;
15
+ default?: any | any[];
16
+ multiple?: boolean;
17
+ model: ExtractionModel;
18
+ extractor?: ExtractorFunction;
19
+ };
20
/** Any entry accepted in an HTML shape: a base entry or a nested entry. */
+ export type HtmlExtractionModelValue = HtmlExtractionModelShapeBaseValue | HtmlExtractionModelShapeNestedValue;
21
/** Maps each output key to the entry describing how to extract it. */
+ export type HtmlExtractionModelShape = {
22
+ [key: string]: HtmlExtractionModelValue;
23
+ };
24
/**
 * Result type of a single entry. Nested entries resolve to the type parameter
 * of their `model` (or `any` when it cannot be inferred); base entries resolve
 * to the awaited return type of their `extractor`. In both cases the result is
 * an array when `multiple` is the literal `true`.
 */
+ export type InferHtmlValue<V extends HtmlExtractionModelValue> = V extends HtmlExtractionModelShapeNestedValue ? V["multiple"] extends true ? V["model"] extends ExtractionModel<infer M> ? M[] : any : V["model"] extends ExtractionModel<infer M> ? M : any : V extends HtmlExtractionModelShapeBaseValue ? V["multiple"] extends true ? Awaited<ReturnType<V["extractor"]>>[] : Awaited<ReturnType<V["extractor"]>> : never;
25
/** Result type of a whole shape: every key mapped through `InferHtmlValue`. */
+ export type InferHtmlShape<S extends HtmlExtractionModelShape> = {
26
+ [K in keyof S]: InferHtmlValue<S[K]>;
27
+ };
28
/** Declared resolution type of `HtmlExtractionModel.extractBaseValue`. */
+ export type ParseBaseValueReturnType = (undefined | string)[] | string | null | undefined;
29
/**
 * Extraction model for HTML sources, driven by a declarative `shape`.
 * The resolved result has one property per shape key (see `InferHtmlShape`).
 */
+ export declare class HtmlExtractionModel<S extends HtmlExtractionModelShape> implements ExtractionModel<InferHtmlShape<S>> {
30
+ readonly shape: S;
31
+ constructor(shape: S);
32
/** Extracts every shape key from `source`; `options` are node-html-parser options. */
+ extract(source: string, options?: NodeHtmlOptions): Promise<InferHtmlShape<S>>;
33
/** Resolves a base (extractor-driven) entry against `root`. */
+ protected extractBaseValue(value: HtmlExtractionModelShapeBaseValue, root: HTMLElement): Promise<ParseBaseValueReturnType>;
34
/** Resolves a nested (model-driven) entry against `root`. */
+ protected extractNestedValue(value: HtmlExtractionModelShapeNestedValue, root: HTMLElement): Promise<any>;
35
+ }
@@ -0,0 +1,74 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.HtmlExtractionModel = void 0;
7
+ const node_html_parser_1 = __importDefault(require("node-html-parser"));
8
+ const errors_1 = require("../errors");
9
+ const utils_1 = require("../utils");
10
+ const parser_1 = require("./parser");
11
/**
 * Extraction model for HTML sources.
 *
 * The model stores a declarative `shape`. `extract()` parses the source with
 * node-html-parser and resolves each shape key, one after another, into a
 * property of the returned object.
 */
class HtmlExtractionModel {
    /**
     * @param shape Object mapping output keys to base or nested entries.
     */
    constructor(shape) {
        this.shape = shape;
    }

    /**
     * Parses `source` and resolves every key of the shape.
     *
     * @param source HTML text to parse.
     * @param options Parser options; defaults to `nodeHtmlParserOptions` from `./parser`.
     * @returns Object with one property per shape key.
     */
    async extract(source, options = parser_1.nodeHtmlParserOptions) {
        const root = node_html_parser_1.default.parse(source, options);
        const result = {};
        for (const key in this.shape) {
            const entry = this.shape[key];
            // An entry is "nested" as soon as it carries a `model` property.
            result[key] = "model" in entry
                ? await this.extractNestedValue(entry, root)
                : await this.extractBaseValue(entry, root);
        }
        return result;
    }

    /**
     * Resolves an extractor-driven entry.
     *
     * Single mode: uses the first element matching `query` (or `root` itself
     * when no query is given); when nothing matches, returns `default` if it
     * is defined, otherwise throws `HTMLElementNotFoundError`.
     * Multiple mode: requires `query` (throws `MultipleQueryError` without
     * one), optionally truncates the matches at `limit`, and runs the
     * extractor on every remaining element.
     */
    async extractBaseValue(value, root) {
        if (!value.multiple) {
            const target = value.query ? (0, utils_1.selectFirstElement)(value.query, root) : root;
            if (target) {
                return await value.extractor(target);
            }
            if (value.default !== undefined) {
                return value.default;
            }
            throw new errors_1.HTMLElementNotFoundError(value.query);
        }
        if (!value.query) {
            throw new errors_1.MultipleQueryError();
        }
        const matches = (0, utils_1.selectManyElements)(value.query, root);
        if (value.limit !== undefined) {
            // Drop everything from index `limit` onwards.
            matches.splice(value.limit);
        }
        const pending = matches.map((node) => value.extractor(node));
        return await Promise.all(pending);
    }

    /**
     * Resolves a model-driven entry.
     *
     * Single mode: the first match is turned into a source string (via the
     * optional `extractor`, else its `outerHTML`) and passed to the nested
     * model; when nothing matches, `default` is returned if defined,
     * otherwise `HTMLElementNotFoundError` is thrown.
     * Multiple mode: every match (optionally truncated at `limit`) is passed
     * to the nested model as `outerHTML`; `extractor` is not consulted here.
     */
    async extractNestedValue(value, root) {
        if (value.multiple) {
            const matches = (0, utils_1.selectManyElements)(value.query, root);
            if (value.limit !== undefined) {
                matches.splice(value.limit);
            }
            const pending = matches.map((node) => value.model.extract(node.outerHTML));
            return await Promise.all(pending);
        }
        const target = (0, utils_1.selectFirstElement)(value.query, root);
        if (!target) {
            if (value.default !== undefined) {
                return value.default;
            }
            throw new errors_1.HTMLElementNotFoundError(value.query);
        }
        let nestedSource;
        if (value.extractor) {
            nestedSource = await value.extractor(target);
        }
        else {
            nestedSource = target.outerHTML;
        }
        return await value.model.extract(nestedSource);
    }
}
74
+ exports.HtmlExtractionModel = HtmlExtractionModel;
@@ -1,3 +1,3 @@
1
- export * from "./extractor-model";
1
+ export * from "./extraction-model";
2
2
  export * from "./parser";
3
3
  export * from "./extractors";
@@ -14,6 +14,6 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
14
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
15
  };
16
16
  Object.defineProperty(exports, "__esModule", { value: true });
17
- __exportStar(require("./extractor-model"), exports);
17
+ __exportStar(require("./extraction-model"), exports);
18
18
  __exportStar(require("./parser"), exports);
19
19
  __exportStar(require("./extractors"), exports);
@@ -1,5 +1,5 @@
1
1
  import { HTMLElement, Options as NodeHtmlOptions } from "node-html-parser";
2
- import { ExtractorModel } from "../interfaces/extractor-model";
2
+ import { ExtractionModel } from "../interfaces/extraction-model";
3
3
  import { BuildedQuery } from "../query-builders";
4
4
  import { ExtractorFunction } from "./extractors";
5
5
  import { SourceParser } from "../source-parser";
@@ -15,11 +15,11 @@ export type ExtractValueOptions = {
15
15
  };
16
16
  export type ExtractModelOptions<T = any> = {
17
17
  query?: BuildedQuery;
18
- model: ExtractorModel<T>;
18
+ model: ExtractionModel<T>;
19
19
  };
20
20
  export type ExtractModelsOptions<T = any> = {
21
21
  query: BuildedQuery;
22
- model: ExtractorModel<T>;
22
+ model: ExtractionModel<T>;
23
23
  limit?: number;
24
24
  };
25
25
  export declare const nodeHtmlParserOptions: {
package/dist/index.d.ts CHANGED
@@ -2,6 +2,6 @@ export * from "./errors";
2
2
  export * from "./html";
3
3
  export * from "./json";
4
4
  export * from "./source-parser";
5
- export * from "./interfaces/extractor-model";
5
+ export * from "./interfaces/extraction-model";
6
6
  export * from "./markdown";
7
7
  export * from "./query-builders";
package/dist/index.js CHANGED
@@ -18,6 +18,6 @@ __exportStar(require("./errors"), exports);
18
18
  __exportStar(require("./html"), exports);
19
19
  __exportStar(require("./json"), exports);
20
20
  __exportStar(require("./source-parser"), exports);
21
- __exportStar(require("./interfaces/extractor-model"), exports);
21
+ __exportStar(require("./interfaces/extraction-model"), exports);
22
22
  __exportStar(require("./markdown"), exports);
23
23
  __exportStar(require("./query-builders"), exports);
@@ -0,0 +1,3 @@
1
/**
 * Contract shared by every extraction model: turn a textual `source` into a
 * value of type `T`, either synchronously or through a promise.
 */
+ export interface ExtractionModel<T = any> {
2
+ extract(source: string): Promise<T> | T;
3
+ }
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,22 @@
1
+ import { ExtractionModel } from "../interfaces/extraction-model";
2
/**
 * One entry of a JSON shape. `query` is the only required field; the
 * implementation evaluates it with `jmespath.search`.
 */
+ export type JsonExtractionModelShapeValue = {
3
+ query: string;
4
+ default?: any;
5
+ model?: ExtractionModel;
6
+ multiple?: boolean;
7
+ limit?: number;
8
+ };
9
/** Maps each output key to the entry describing how to extract it. */
+ export type JsonExtractionModelShape = {
10
+ [key: string]: JsonExtractionModelShapeValue;
11
+ };
12
/**
 * Result type of a single entry: the type parameter of `model` (an array of it
 * when `multiple` is the literal `true`), or `any` when no model type can be
 * inferred.
 */
+ export type InferJsonValue<V extends JsonExtractionModelShapeValue> = V["model"] extends ExtractionModel<infer M> ? (V["multiple"] extends true ? M[] : M) : any;
13
/** Result type of a whole shape: every key mapped through `InferJsonValue`. */
+ export type InferJsonShape<S extends JsonExtractionModelShape> = {
14
+ [K in keyof S]: InferJsonValue<S[K]>;
15
+ };
16
/**
 * Extraction model for JSON sources, driven by a declarative `shape`.
 * The resolved result has one property per shape key (see `InferJsonShape`).
 */
+ export declare class JsonExtractionModel<S extends JsonExtractionModelShape> implements ExtractionModel<InferJsonShape<S>> {
17
+ readonly shape: S;
18
+ constructor(shape: S);
19
/** Parses `source` as JSON and extracts every shape key from it. */
+ extract(source: string): Promise<InferJsonShape<S>>;
20
/** Evaluates `value.query` against `root` for an entry without a nested model. */
+ extractValue(value: JsonExtractionModelShapeValue, root: any): any;
21
/** Evaluates `value.query` against `root` for an entry that carries a nested model. */
+ extractNestedValue(value: JsonExtractionModelShapeValue, root: any): Promise<any>;
22
+ }
@@ -0,0 +1,97 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.JsonExtractionModel = void 0;
37
+ const jmespath = __importStar(require("jmespath"));
38
+ class JsonExtractionModel {
39
+ constructor(shape) {
40
+ this.shape = shape;
41
+ }
42
+ async extract(source) {
43
+ const root = JSON.parse(source);
44
+ const data = {};
45
+ for (const key in this.shape) {
46
+ const value = this.shape[key];
47
+ const isNestedValue = "model" in value;
48
+ if (isNestedValue) {
49
+ data[key] = await this.extractNestedValue(value, root);
50
+ }
51
+ else {
52
+ data[key] = this.extractValue(value, root);
53
+ }
54
+ }
55
+ return data;
56
+ }
57
+ extractValue(value, root) {
58
+ const extractedData = jmespath.search(root, value.query);
59
+ if (extractedData === null && value.default !== undefined) {
60
+ return value.default;
61
+ }
62
+ return extractedData;
63
+ }
64
+ async extractNestedValue(value, root) {
65
+ const extractedData = jmespath.search(root, value.query);
66
+ const model = value.model;
67
+ const modelIsJsonExtractor = model.constructor.name === JsonExtractionModel.name;
68
+ if (extractedData === null && value.default !== undefined) {
69
+ return value.default;
70
+ }
71
+ if (value.multiple) {
72
+ if (!Array.isArray(extractedData)) {
73
+ throw new Error(`Expected an array for multiple values, but got ${typeof extractedData}`);
74
+ }
75
+ if (value.limit !== undefined) {
76
+ extractedData.splice(value.limit);
77
+ }
78
+ if (!modelIsJsonExtractor) {
79
+ if (extractedData.some((item) => typeof item !== "string")) {
80
+ throw new Error(`Expected an array of strings for model parsing, but got ${typeof extractedData[0]}`);
81
+ }
82
+ return await Promise.all(extractedData.map((item) => model.extract(JSON.stringify(item))));
83
+ }
84
+ }
85
+ else {
86
+ if (!modelIsJsonExtractor && typeof extractedData !== "string") {
87
+ throw new Error(`Expected a string for model parsing, but got ${typeof extractedData}`);
88
+ }
89
+ if (!modelIsJsonExtractor) {
90
+ return await model.extract(extractedData);
91
+ }
92
+ return await model.extract(JSON.stringify(extractedData));
93
+ }
94
+ return extractedData;
95
+ }
96
+ }
97
+ exports.JsonExtractionModel = JsonExtractionModel;
@@ -1,2 +1,2 @@
1
- export * from "./extractor-model";
1
+ export * from "./extraction-model";
2
2
  export * from "./parser";
@@ -14,5 +14,5 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
14
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
15
  };
16
16
  Object.defineProperty(exports, "__esModule", { value: true });
17
- __exportStar(require("./extractor-model"), exports);
17
+ __exportStar(require("./extraction-model"), exports);
18
18
  __exportStar(require("./parser"), exports);
@@ -0,0 +1,4 @@
1
+ import { HtmlExtractionModel, HtmlExtractionModelShape, InferHtmlShape } from "../html";
2
/**
 * Extraction model for Markdown sources. It reuses the HTML shape and result
 * types and overrides `extract` to take only the source text.
 */
+ export declare class MarkdownExtractionModel<S extends HtmlExtractionModelShape> extends HtmlExtractionModel<S> {
3
+ extract(source: string): Promise<InferHtmlShape<S>>;
4
+ }
@@ -0,0 +1,17 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.MarkdownExtractionModel = void 0;
7
+ const he_1 = __importDefault(require("he"));
8
+ const html_1 = require("../html");
9
+ const constants_1 = require("./constants");
10
/**
 * Extraction model for Markdown sources, built on the HTML model.
 */
class MarkdownExtractionModel extends html_1.HtmlExtractionModel {
    /**
     * Renders `source` with the shared `markdownIt` instance, decodes HTML
     * entities with `he.decode`, replaces every U+00A0 with a plain space,
     * wraps the result in a minimal HTML document and delegates to the
     * parent `extract` with its default options.
     *
     * @param source Markdown text.
     * @returns Whatever the inherited HTML extraction resolves to.
     */
    async extract(source) {
        const rendered = constants_1.markdownIt.render(source);
        const decoded = he_1.default.decode(rendered);
        const body = decoded.replace(/\u00A0/g, " ");
        const htmlDocument = `<!DOCTYPE html><html lang="en"><head><title>Markdown Document</title></head><body>${body}</body></html>`;
        return await super.extract(htmlDocument);
    }
}
17
+ exports.MarkdownExtractionModel = MarkdownExtractionModel;
@@ -1,2 +1,2 @@
1
1
  export * from "./parser";
2
- export * from "./extractor-model";
2
+ export * from "./extraction-model";
@@ -15,4 +15,4 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
15
15
  };
16
16
  Object.defineProperty(exports, "__esModule", { value: true });
17
17
  __exportStar(require("./parser"), exports);
18
- __exportStar(require("./extractor-model"), exports);
18
+ __exportStar(require("./extraction-model"), exports);
@@ -1,6 +1,6 @@
1
1
  import { Abortable } from "node:events";
2
2
  import { OpenMode } from "node:fs";
3
- import { ExtractorModel } from "./interfaces/extractor-model";
3
+ import { ExtractionModel } from "./interfaces/extraction-model";
4
4
  export type SourceParserLoadFileOptions = ({
5
5
  encoding?: BufferEncoding | null;
6
6
  flag?: OpenMode;
@@ -8,6 +8,6 @@ export type SourceParserLoadFileOptions = ({
8
8
  export declare class SourceParser {
9
9
  readonly source: string;
10
10
  constructor(source: string);
11
- extractWithModel<T>(extractorModel: ExtractorModel<T>): Promise<T>;
11
+ extractWithModel<T>(extractionModel: ExtractionModel<T>): Promise<T>;
12
12
  static loadFile<T extends typeof SourceParser>(this: T, path: string, options?: SourceParserLoadFileOptions): Promise<InstanceType<T>>;
13
13
  }
@@ -9,8 +9,8 @@ class SourceParser {
9
9
  constructor(source) {
10
10
  this.source = source;
11
11
  }
12
- async extractWithModel(extractorModel) {
13
- return await extractorModel.extract(this.source);
12
+ async extractWithModel(extractionModel) {
13
+ return await extractionModel.extract(this.source);
14
14
  }
15
15
  static async loadFile(path, options = {
16
16
  encoding: "utf-8",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@xcrap/extractor",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "Xcrap Extractor is a package of the Xcrap framework, it was developed to take care of the data extraction part of text files (currently supporting only HTML, JSON and Markdown) using declarative models.",
5
5
  "keywords": [
6
6
  "xcrap",
@@ -40,7 +40,7 @@
40
40
  "@types/he": "^1.2.3",
41
41
  "@types/jest": "^29.5.14",
42
42
  "@types/jmespath": "^0.15.2",
43
- "@types/node": "^22.13.14",
43
+ "@types/node": "^22.19.11",
44
44
  "@types/xmldom": "^0.1.34",
45
45
  "eslint": "^10.0.0",
46
46
  "eslint-config-prettier": "^10.1.8",
@@ -48,16 +48,16 @@
48
48
  "jest": "^29.7.0",
49
49
  "prettier": "^3.8.1",
50
50
  "prettier-plugin-sort-imports": "^1.8.10",
51
- "ts-jest": "^29.3.0",
51
+ "ts-jest": "^29.4.6",
52
52
  "ts-node": "^10.9.2",
53
- "typescript": "^5.8.2"
53
+ "typescript": "^5.9.3"
54
54
  },
55
55
  "dependencies": {
56
56
  "he": "^1.2.0",
57
57
  "jmespath": "^0.16.0",
58
- "markdown-it": "^14.1.0",
58
+ "markdown-it": "^14.1.1",
59
59
  "markdown-it-anchor": "^9.2.0",
60
- "node-html-parser": "^7.0.1",
60
+ "node-html-parser": "^7.0.2",
61
61
  "xmldom": "^0.6.0",
62
62
  "xpath": "^0.0.34"
63
63
  }