@xcrap/extractor 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +131 -0
- package/dist/errors.d.ts +13 -0
- package/dist/errors.js +29 -0
- package/dist/html/extractor-model.d.ts +34 -0
- package/dist/html/extractor-model.js +71 -0
- package/dist/html/extractors.d.ts +61 -0
- package/dist/html/extractors.js +145 -0
- package/dist/html/index.d.ts +3 -0
- package/dist/html/index.js +19 -0
- package/dist/html/parser.d.ts +41 -0
- package/dist/html/parser.js +72 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +23 -0
- package/dist/interfaces/extractor-model.d.ts +3 -0
- package/dist/interfaces/extractor-model.js +2 -0
- package/dist/json/extractor-model.d.ts +22 -0
- package/dist/json/extractor-model.js +97 -0
- package/dist/json/index.d.ts +2 -0
- package/dist/json/index.js +18 -0
- package/dist/json/parser.d.ts +6 -0
- package/dist/json/parser.js +48 -0
- package/dist/markdown/constants.d.ts +2 -0
- package/dist/markdown/constants.js +11 -0
- package/dist/markdown/extractor-model.d.ts +4 -0
- package/dist/markdown/extractor-model.js +17 -0
- package/dist/markdown/index.d.ts +2 -0
- package/dist/markdown/index.js +18 -0
- package/dist/markdown/parser.d.ts +5 -0
- package/dist/markdown/parser.js +18 -0
- package/dist/query-builders/html.d.ts +6 -0
- package/dist/query-builders/html.js +16 -0
- package/dist/query-builders/index.d.ts +1 -0
- package/dist/query-builders/index.js +17 -0
- package/dist/source-parser.d.ts +13 -0
- package/dist/source-parser.js +22 -0
- package/dist/utils/index.d.ts +3 -0
- package/dist/utils/index.js +19 -0
- package/dist/utils/select-first-element.d.ts +3 -0
- package/dist/utils/select-first-element.js +60 -0
- package/dist/utils/select-many-elements.d.ts +3 -0
- package/dist/utils/select-many-elements.js +57 -0
- package/dist/utils/slugify.d.ts +1 -0
- package/dist/utils/slugify.js +10 -0
- package/package.json +64 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./errors"), exports);
|
|
18
|
+
__exportStar(require("./html"), exports);
|
|
19
|
+
__exportStar(require("./json"), exports);
|
|
20
|
+
__exportStar(require("./source-parser"), exports);
|
|
21
|
+
__exportStar(require("./interfaces/extractor-model"), exports);
|
|
22
|
+
__exportStar(require("./markdown"), exports);
|
|
23
|
+
__exportStar(require("./query-builders"), exports);
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { ExtractorModel } from "../interfaces/extractor-model";
|
|
2
|
+
/**
 * Extraction rule for one output key of a JSON extractor shape.
 */
export type JsonExtractorModelShapeValue = {
    /** Query expression evaluated against the parsed JSON document. */
    query: string;
    /** Returned instead of a `null` query result; has no effect when left `undefined`. */
    default?: any;
    /** Nested model that post-processes the query result. */
    model?: ExtractorModel;
    /** With `model` set, the query result must be an array. */
    multiple?: boolean;
    /** With `model` and `multiple` set, keeps only the first `limit` items. */
    limit?: number;
};

/**
 * Mapping from output key to the rule that produces its value.
 */
export type JsonExtractorModelShape = {
    [key: string]: JsonExtractorModelShapeValue;
};

/**
 * Output type of a single rule: the nested model's output (as an array when
 * `multiple` is `true`), or `any` when the rule has no model.
 */
export type InferJsonValue<V extends JsonExtractorModelShapeValue> =
    V["model"] extends ExtractorModel<infer M>
        ? (V["multiple"] extends true ? M[] : M)
        : any;

/**
 * Output type of a whole shape: one inferred value per shape key.
 */
export type InferJsonShape<S extends JsonExtractorModelShape> = {
    [K in keyof S]: InferJsonValue<S[K]>;
};

/**
 * Declarative extractor that maps a JSON source string onto the keys of `shape`.
 */
export declare class JsonExtractorModel<S extends JsonExtractorModelShape>
    implements ExtractorModel<InferJsonShape<S>>
{
    /** Rules this model was constructed with. */
    readonly shape: S;

    constructor(shape: S);

    /** Parses `source` as JSON and resolves every key of `shape` against it. */
    extract(source: string): Promise<InferJsonShape<S>>;

    /** Resolves a rule against `root`, applying `default` to a `null` result. */
    extractValue(value: JsonExtractorModelShapeValue, root: any): any;

    /** Resolves a rule that carries a nested `model` against `root`. */
    extractNestedValue(value: JsonExtractorModelShapeValue, root: any): Promise<any>;
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.JsonExtractorModel = void 0;
|
|
37
|
+
const jmespath = __importStar(require("jmespath"));
|
|
38
|
+
/**
 * Declarative extractor for JSON documents.
 *
 * Each key of `shape` describes one output field: a JMESPath `query`, an
 * optional `default` used when the query yields `null`, and an optional nested
 * `model` (with `multiple` / `limit`) that post-processes the query result.
 */
class JsonExtractorModel {
    /**
     * @param shape Map of output keys to extraction rules.
     */
    constructor(shape) {
        this.shape = shape;
    }
    /**
     * Parses `source` as JSON and resolves every key of the shape against it.
     * Keys are resolved one after another, in enumeration order.
     *
     * @param source JSON text; malformed input makes `JSON.parse` throw.
     * @returns An object with one entry per shape key.
     */
    async extract(source) {
        const root = JSON.parse(source);
        const data = {};
        for (const key in this.shape) {
            const value = this.shape[key];
            // A rule only counts as nested when it actually carries a model;
            // `model: undefined` is treated like a plain rule instead of crashing.
            const isNestedValue = value.model !== undefined && value.model !== null;
            data[key] = isNestedValue
                ? await this.extractNestedValue(value, root)
                : this.extractValue(value, root);
        }
        return data;
    }
    /**
     * Resolves a plain rule (no nested model).
     *
     * @returns The query result, or `value.default` when the result is `null`
     *          and a default other than `undefined` was provided.
     */
    extractValue(value, root) {
        const extractedData = jmespath.search(root, value.query);
        if (extractedData === null && value.default !== undefined) {
            return value.default;
        }
        return extractedData;
    }
    /**
     * Resolves a rule that carries a nested model.
     *
     * A nested `JsonExtractorModel` receives each matched value re-serialized
     * as JSON. Any other model receives the matched string(s) untouched, so
     * embedded HTML/Markdown reaches it exactly as stored in the document.
     *
     * @throws Error when `multiple` is set and the query result is not an array,
     *         or when a non-JSON model would receive a non-string value.
     */
    async extractNestedValue(value, root) {
        const extractedData = jmespath.search(root, value.query);
        if (extractedData === null && value.default !== undefined) {
            return value.default;
        }
        const model = value.model;
        // `instanceof` also accepts subclasses; the name comparison is kept so
        // instances created by another copy of this class are still recognized.
        const modelIsJsonExtractor = model instanceof JsonExtractorModel ||
            model.constructor.name === JsonExtractorModel.name;
        if (value.multiple) {
            if (!Array.isArray(extractedData)) {
                throw new Error(`Expected an array for multiple values, but got ${typeof extractedData}`);
            }
            // Copy instead of splicing: the array may be the very one stored in
            // `root`, and truncating it in place would corrupt later queries.
            const items = value.limit !== undefined
                ? extractedData.slice(0, value.limit)
                : extractedData;
            if (modelIsJsonExtractor) {
                return await Promise.all(items.map((item) => model.extract(JSON.stringify(item))));
            }
            const offendingIndex = items.findIndex((item) => typeof item !== "string");
            if (offendingIndex !== -1) {
                throw new Error(`Expected an array of strings for model parsing, but got ${typeof items[offendingIndex]}`);
            }
            return await Promise.all(items.map((item) => model.extract(item)));
        }
        if (modelIsJsonExtractor) {
            return await model.extract(JSON.stringify(extractedData));
        }
        if (typeof extractedData !== "string") {
            throw new Error(`Expected a string for model parsing, but got ${typeof extractedData}`);
        }
        return await model.extract(extractedData);
    }
}
|
|
97
|
+
exports.JsonExtractorModel = JsonExtractorModel;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./extractor-model"), exports);
|
|
18
|
+
__exportStar(require("./parser"), exports);
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.JsonParser = void 0;
|
|
37
|
+
const jmespath = __importStar(require("jmespath"));
|
|
38
|
+
const source_parser_1 = require("../source-parser");
|
|
39
|
+
/**
 * Source parser for JSON text: keeps the raw source (via the base class) and
 * the parsed document, and answers JMESPath queries against the latter.
 */
class JsonParser extends source_parser_1.SourceParser {
    /**
     * @param source JSON text. It is parsed eagerly, so malformed input makes
     *               `JSON.parse` throw from the constructor.
     */
    constructor(source) {
        super(source);
        const parsedDocument = JSON.parse(source);
        this.data = parsedDocument;
    }
    /**
     * Runs `jmespath.search` with `query` over the parsed document.
     *
     * @returns Whatever `jmespath.search` returns for the query.
     */
    extract(query) {
        const { data } = this;
        return jmespath.search(data, query);
    }
}
|
|
48
|
+
exports.JsonParser = JsonParser;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.markdownIt = void 0;
|
|
7
|
+
const markdown_it_anchor_1 = __importDefault(require("markdown-it-anchor"));
|
|
8
|
+
const markdown_it_1 = __importDefault(require("markdown-it"));
|
|
9
|
+
// Shared markdown-it renderer used by the Markdown parser and extractor model.
// It is created with the `html: true` option and has the markdown-it-anchor
// plugin registered on it.
const sharedMarkdownRenderer = new markdown_it_1.default({ html: true });
sharedMarkdownRenderer.use(markdown_it_anchor_1.default);
exports.markdownIt = sharedMarkdownRenderer;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.MarkdownExtractorModel = void 0;
|
|
7
|
+
const he_1 = __importDefault(require("he"));
|
|
8
|
+
const html_1 = require("../html");
|
|
9
|
+
const constants_1 = require("./constants");
|
|
10
|
+
/**
 * Extractor model for Markdown sources: renders the Markdown to HTML and then
 * delegates to the HTML extractor model it extends.
 */
// NOTE(review): the base class is referenced as `HtmlExtrctorModel` (sic). The
// `../html` module is not visible here, so the spelling is kept as-is — confirm
// the exported name before renaming.
class MarkdownExtractorModel extends html_1.HtmlExtrctorModel {
    /**
     * @param source Markdown text.
     * @returns The result of the inherited `extract` run on the generated HTML page.
     */
    async extract(source) {
        const renderedHtml = constants_1.markdownIt.render(source);
        // Entities are decoded and non-breaking spaces become plain spaces
        // before the fragment is wrapped into a complete HTML document.
        const bodyMarkup = he_1.default.decode(renderedHtml).replace(/\u00A0/g, " ");
        const htmlDocument = `<!DOCTYPE html><html lang="en"><head><title>Markdown Document</title></head><body>${bodyMarkup}</body></html>`;
        const extracted = await super.extract(htmlDocument);
        return extracted;
    }
}
|
|
17
|
+
exports.MarkdownExtractorModel = MarkdownExtractorModel;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./parser"), exports);
|
|
18
|
+
__exportStar(require("./extractor-model"), exports);
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.MarkdownParser = void 0;
|
|
7
|
+
const he_1 = __importDefault(require("he"));
|
|
8
|
+
const constants_1 = require("./constants");
|
|
9
|
+
const html_1 = require("../html");
|
|
10
|
+
/**
 * Parser for Markdown sources: the Markdown is rendered to HTML, wrapped in a
 * full HTML document and handed to the HTML parser this class extends.
 */
class MarkdownParser extends html_1.HtmlParser {
    /**
     * @param source Markdown text; also kept verbatim on `markdownSource`.
     */
    constructor(source) {
        // Render, decode entities, then turn non-breaking spaces into plain spaces.
        const bodyMarkup = he_1.default
            .decode(constants_1.markdownIt.render(source))
            .replace(/\u00A0/g, " ");
        super(`<!DOCTYPE html><html lang="en"><head><title>Markdown Document</title></head><body>${bodyMarkup}</body></html>`);
        this.markdownSource = source;
    }
}
|
|
18
|
+
exports.MarkdownParser = MarkdownParser;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.css = css;
|
|
4
|
+
exports.xpath = xpath;
|
|
5
|
+
/**
 * Builds a query descriptor tagged as a CSS selector.
 *
 * @param query Selector text, stored unchanged on `value`.
 * @returns `{ value: query, type: "css" }`.
 */
function css(query) {
    const descriptor = { value: query, type: "css" };
    return descriptor;
}
|
|
11
|
+
/**
 * Builds a query descriptor tagged as an XPath expression.
 *
 * @param query Expression text, stored unchanged on `value`.
 * @returns `{ value: query, type: "xpath" }`.
 */
function xpath(query) {
    const descriptor = { value: query, type: "xpath" };
    return descriptor;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./html";
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./html"), exports);
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { Abortable } from "node:events";
|
|
2
|
+
import { OpenMode } from "node:fs";
|
|
3
|
+
import { ExtractorModel } from "./interfaces/extractor-model";
|
|
4
|
+
/**
 * Options accepted by `SourceParser.loadFile`.
 */
export type SourceParserLoadFileOptions =
    | ({
          /** Encoding used to read the file. */
          encoding?: BufferEncoding | null;
          /** File open mode. */
          flag?: OpenMode;
      } & Abortable)
    | null;

/**
 * Base class for parsers that wrap a source string.
 */
export declare class SourceParser {
    /** The source text this parser was constructed with. */
    readonly source: string;

    constructor(source: string);

    /** Runs `extractorModel.extract` on `source` and resolves with its result. */
    extractWithModel<T>(extractorModel: ExtractorModel<T>): Promise<T>;

    /**
     * Reads the file at `path` and constructs an instance of the class the
     * method is called on with the file content as its source.
     */
    static loadFile<T extends typeof SourceParser>(
        this: T,
        path: string,
        options?: SourceParserLoadFileOptions,
    ): Promise<InstanceType<T>>;
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.SourceParser = void 0;
|
|
7
|
+
const node_fs_1 = __importDefault(require("node:fs"));
|
|
8
|
+
/**
 * Base class for parsers: holds a source string and can hand it to an
 * extractor model or be built straight from a file.
 */
class SourceParser {
    /**
     * @param source Text this parser operates on.
     */
    constructor(source) {
        this.source = source;
    }
    /**
     * Passes the stored source to `extractorModel.extract`.
     *
     * @returns The value the model's `extract` resolves with.
     */
    async extractWithModel(extractorModel) {
        const extracted = await extractorModel.extract(this.source);
        return extracted;
    }
    /**
     * Reads `path` with `fs.promises.readFile` and constructs the class this
     * method was invoked on (so subclasses get their own type) from the content.
     *
     * @param path    File to read.
     * @param options Forwarded to `readFile`; `{ encoding: "utf-8" }` when omitted.
     */
    static async loadFile(path, options = { encoding: "utf-8" }) {
        const rawContent = await node_fs_1.default.promises.readFile(path, options);
        // `toString()` covers both a string result and a Buffer result.
        const text = rawContent.toString();
        return new this(text);
    }
}
|
|
22
|
+
exports.SourceParser = SourceParser;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./select-first-element"), exports);
|
|
18
|
+
__exportStar(require("./select-many-elements"), exports);
|
|
19
|
+
__exportStar(require("./slugify"), exports);
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.selectFirstElement = selectFirstElement;
|
|
40
|
+
const xmldom_1 = require("xmldom");
|
|
41
|
+
const node_html_parser_1 = __importDefault(require("node-html-parser"));
|
|
42
|
+
const xpathLib = __importStar(require("xpath"));
|
|
43
|
+
/**
 * Returns the first element matched by `query` below `root`.
 *
 * CSS queries go straight to `root.querySelector`. For any other query type
 * the serialized `root` is parsed with xmldom, the XPath expression is
 * evaluated on that document, and the first matched node is serialized and
 * re-parsed with node-html-parser.
 *
 * @param query Descriptor with `type` and `value`, as built by `css()` / `xpath()`.
 * @param root  Node to search in; must provide `querySelector` and `toString`.
 * @returns The first match, or `null` when nothing matched or the XPath
 *          expression did not evaluate to a node list.
 */
function selectFirstElement(query, root) {
    if (query.type === "css") {
        return root.querySelector(query.value);
    }
    const doc = new xmldom_1.DOMParser({
        locator: {},
        errorHandler: {
            // Parser warnings are silenced.
            warning: () => { },
        },
    }).parseFromString(root.toString());
    const selected = xpathLib.select(query.value, doc);
    // `select` yields a string/number/boolean for scalar expressions such as
    // `count(//a)`; those carry no element, so they count as "no match".
    if (!Array.isArray(selected) || selected.length === 0) {
        return null;
    }
    const reparsed = node_html_parser_1.default.parse(selected[0].toString());
    const firstNode = reparsed.childNodes[0];
    // Keep the "no match" result uniform even if re-parsing produced no node.
    return firstNode === undefined ? null : firstNode;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
36
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
37
|
+
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.selectManyElements = selectManyElements;
|
|
40
|
+
const xmldom_1 = require("xmldom");
|
|
41
|
+
const node_html_parser_1 = __importDefault(require("node-html-parser"));
|
|
42
|
+
const xpathLib = __importStar(require("xpath"));
|
|
43
|
+
/**
 * Returns every element matched by `query` below `root`.
 *
 * CSS queries go straight to `root.querySelectorAll`. For any other query type
 * the serialized `root` is parsed with xmldom, the XPath expression is
 * evaluated on that document, and each matched node is serialized and
 * re-parsed with node-html-parser.
 *
 * @param query Descriptor with `type` and `value`, as built by `css()` / `xpath()`.
 * @param root  Node to search in; must provide `querySelectorAll` and `toString`.
 * @returns The matches; an empty array when the XPath expression did not
 *          evaluate to a node list.
 */
function selectManyElements(query, root) {
    if (query.type === "css") {
        return root.querySelectorAll(query.value);
    }
    const doc = new xmldom_1.DOMParser({
        locator: {},
        errorHandler: {
            // Parser warnings are silenced.
            warning: () => { },
        },
    }).parseFromString(root.toString());
    const selected = xpathLib.select(query.value, doc);
    // `select` yields a string/number/boolean for scalar expressions such as
    // `count(//a)`; there are no elements to map in that case.
    if (!Array.isArray(selected)) {
        return [];
    }
    return selected.map((node) => node_html_parser_1.default.parse(node.toString()).childNodes[0]);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function slugify(text: string): string;
|
package/package.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@xcrap/extractor",
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "Xcrap Extractor is a package of the Xcrap framework, it was developed to take care of the data extraction part of text files (currently supporting only HTML, JSON and Markdown) using declarative models.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"xcrap",
|
|
7
|
+
"extractor",
|
|
8
|
+
"parser",
|
|
9
|
+
"web scraping",
|
|
10
|
+
"node-html-parser",
|
|
11
|
+
"parsel",
|
|
12
|
+
"scrapy"
|
|
13
|
+
],
|
|
14
|
+
"homepage": "https://github.com/Xcrap-Cloud/parser#readme",
|
|
15
|
+
"bugs": {
|
|
16
|
+
"url": "https://github.com/Xcrap-Cloud/parser/issues"
|
|
17
|
+
},
|
|
18
|
+
"repository": {
|
|
19
|
+
"type": "git",
|
|
20
|
+
"url": "git+https://github.com/Xcrap-Cloud/parser.git"
|
|
21
|
+
},
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"author": "Marcuth",
|
|
24
|
+
"main": "./dist/index.js",
|
|
25
|
+
"module": "./dist/index.js",
|
|
26
|
+
"types": "./dist/index.d.ts",
|
|
27
|
+
"files": [
|
|
28
|
+
"dist/*",
|
|
29
|
+
"!/**/__tests__"
|
|
30
|
+
],
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
},
|
|
34
|
+
"scripts": {
|
|
35
|
+
"build": "tsc",
|
|
36
|
+
"test": "jest",
|
|
37
|
+
"format": "prettier --write \"src/**/*.ts\" \"__tests__/**/*.ts\""
|
|
38
|
+
},
|
|
39
|
+
"devDependencies": {
|
|
40
|
+
"@types/he": "^1.2.3",
|
|
41
|
+
"@types/jest": "^29.5.14",
|
|
42
|
+
"@types/jmespath": "^0.15.2",
|
|
43
|
+
"@types/node": "^22.13.14",
|
|
44
|
+
"@types/xmldom": "^0.1.34",
|
|
45
|
+
"eslint": "^10.0.0",
|
|
46
|
+
"eslint-config-prettier": "^10.1.8",
|
|
47
|
+
"eslint-plugin-prettier": "^5.5.5",
|
|
48
|
+
"jest": "^29.7.0",
|
|
49
|
+
"prettier": "^3.8.1",
|
|
50
|
+
"prettier-plugin-sort-imports": "^1.8.10",
|
|
51
|
+
"ts-jest": "^29.3.0",
|
|
52
|
+
"ts-node": "^10.9.2",
|
|
53
|
+
"typescript": "^5.8.2"
|
|
54
|
+
},
|
|
55
|
+
"dependencies": {
|
|
56
|
+
"he": "^1.2.0",
|
|
57
|
+
"jmespath": "^0.16.0",
|
|
58
|
+
"markdown-it": "^14.1.0",
|
|
59
|
+
"markdown-it-anchor": "^9.2.0",
|
|
60
|
+
"node-html-parser": "^7.0.1",
|
|
61
|
+
"xmldom": "^0.6.0",
|
|
62
|
+
"xpath": "^0.0.34"
|
|
63
|
+
}
|
|
64
|
+
}
|