apify-schema-tools 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cspell/custom-dictionary.txt +4 -0
- package/.husky/pre-commit +33 -0
- package/.node-version +1 -0
- package/CHANGELOG.md +88 -0
- package/LICENSE +201 -0
- package/README.md +312 -0
- package/biome.json +31 -0
- package/dist/apify-schema-tools.d.ts +3 -0
- package/dist/apify-schema-tools.d.ts.map +1 -0
- package/dist/apify-schema-tools.js +197 -0
- package/dist/apify-schema-tools.js.map +1 -0
- package/dist/apify.d.ts +11 -0
- package/dist/apify.d.ts.map +1 -0
- package/dist/apify.js +107 -0
- package/dist/apify.js.map +1 -0
- package/dist/configuration.d.ts +43 -0
- package/dist/configuration.d.ts.map +1 -0
- package/dist/configuration.js +87 -0
- package/dist/configuration.js.map +1 -0
- package/dist/filesystem.d.ts +8 -0
- package/dist/filesystem.d.ts.map +1 -0
- package/dist/filesystem.js +16 -0
- package/dist/filesystem.js.map +1 -0
- package/dist/json-schemas.d.ts +34 -0
- package/dist/json-schemas.d.ts.map +1 -0
- package/dist/json-schemas.js +185 -0
- package/dist/json-schemas.js.map +1 -0
- package/dist/typescript.d.ts +26 -0
- package/dist/typescript.d.ts.map +1 -0
- package/dist/typescript.js +316 -0
- package/dist/typescript.js.map +1 -0
- package/package.json +60 -0
- package/samples/all-defaults/.actor/actor.json +15 -0
- package/samples/all-defaults/.actor/dataset_schema.json +32 -0
- package/samples/all-defaults/.actor/input_schema.json +53 -0
- package/samples/all-defaults/src/generated/dataset.ts +24 -0
- package/samples/all-defaults/src/generated/input-utils.ts +60 -0
- package/samples/all-defaults/src/generated/input.ts +42 -0
- package/samples/all-defaults/src-schemas/dataset-item.json +28 -0
- package/samples/all-defaults/src-schemas/input.json +73 -0
- package/samples/deep-merged-schemas/.actor/actor.json +15 -0
- package/samples/deep-merged-schemas/.actor/dataset_schema.json +37 -0
- package/samples/deep-merged-schemas/.actor/input_schema.json +61 -0
- package/samples/deep-merged-schemas/add-schemas/dataset-item.json +10 -0
- package/samples/deep-merged-schemas/add-schemas/input.json +33 -0
- package/samples/deep-merged-schemas/src/generated/dataset.ts +28 -0
- package/samples/deep-merged-schemas/src/generated/input-utils.ts +66 -0
- package/samples/deep-merged-schemas/src/generated/input.ts +47 -0
- package/samples/deep-merged-schemas/src-schemas/dataset-item.json +28 -0
- package/samples/deep-merged-schemas/src-schemas/input.json +73 -0
- package/samples/merged-schemas/.actor/actor.json +15 -0
- package/samples/merged-schemas/.actor/dataset_schema.json +37 -0
- package/samples/merged-schemas/.actor/input_schema.json +58 -0
- package/samples/merged-schemas/add-schemas/dataset-item.json +10 -0
- package/samples/merged-schemas/add-schemas/input.json +33 -0
- package/samples/merged-schemas/src/generated/dataset.ts +28 -0
- package/samples/merged-schemas/src/generated/input-utils.ts +57 -0
- package/samples/merged-schemas/src/generated/input.ts +42 -0
- package/samples/merged-schemas/src-schemas/dataset-item.json +28 -0
- package/samples/merged-schemas/src-schemas/input.json +73 -0
- package/samples/package-json-config/.actor/actor.json +15 -0
- package/samples/package-json-config/.actor/dataset_schema.json +32 -0
- package/samples/package-json-config/.actor/input_schema.json +53 -0
- package/samples/package-json-config/custom-src-schemas/dataset-item.json +28 -0
- package/samples/package-json-config/custom-src-schemas/input.json +73 -0
- package/samples/package-json-config/package.json +11 -0
- package/samples/package-json-config/src/custom-generated/dataset.ts +24 -0
- package/samples/package-json-config/src/custom-generated/input-utils.ts +60 -0
- package/samples/package-json-config/src/custom-generated/input.ts +42 -0
- package/src/apify-schema-tools.ts +302 -0
- package/src/apify.ts +124 -0
- package/src/configuration.ts +110 -0
- package/src/filesystem.ts +18 -0
- package/src/json-schemas.ts +252 -0
- package/src/typescript.ts +381 -0
- package/test/apify-schema-tools.test.ts +2064 -0
- package/test/apify.test.ts +28 -0
- package/test/common.ts +19 -0
- package/test/configuration.test.ts +642 -0
- package/test/json-schemas.test.ts +587 -0
- package/test/typescript.test.ts +817 -0
- package/tsconfig.json +18 -0
- package/update-samples.sh +27 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper",
|
|
3
|
+
"description": "startUrls and searchTerm are required.",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"schemaVersion": 1,
|
|
6
|
+
"properties": {
|
|
7
|
+
"startUrls": {
|
|
8
|
+
"position": 10,
|
|
9
|
+
"type": "array",
|
|
10
|
+
"title": "Start URLs",
|
|
11
|
+
"description": "List of URLs to scrape",
|
|
12
|
+
"default": [],
|
|
13
|
+
"editor": "requestListSources",
|
|
14
|
+
"items": {
|
|
15
|
+
"type": "object",
|
|
16
|
+
"properties": {
|
|
17
|
+
"url": { "type": "string" }
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"searchTerm": {
|
|
22
|
+
"position": 20,
|
|
23
|
+
"type": "string",
|
|
24
|
+
"title": "Search term",
|
|
25
|
+
"description": "Term to search for",
|
|
26
|
+
"minLength": 1,
|
|
27
|
+
"maxLength": 100
|
|
28
|
+
},
|
|
29
|
+
"categories": {
|
|
30
|
+
"position": 30,
|
|
31
|
+
"type": "array",
|
|
32
|
+
"title": "Categories",
|
|
33
|
+
"description": "List of categories to filter results",
|
|
34
|
+
"default": [],
|
|
35
|
+
"items": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"name": { "type": "string", "title": "Category name" },
|
|
39
|
+
"id": { "type": "string", "title": "Category ID" }
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"maxPages": {
|
|
44
|
+
"position": 30,
|
|
45
|
+
"sectionCaption": "Scraping options",
|
|
46
|
+
"sectionDescription": "Configure how many pages to scrape and other options.",
|
|
47
|
+
"type": "integer",
|
|
48
|
+
"title": "Maximum pages",
|
|
49
|
+
"description": "Maximum number of pages to scrape",
|
|
50
|
+
"default": 10,
|
|
51
|
+
"minimum": 1,
|
|
52
|
+
"maximum": 1000
|
|
53
|
+
},
|
|
54
|
+
"proxy": {
|
|
55
|
+
"position": 40,
|
|
56
|
+
"type": "object",
|
|
57
|
+
"title": "Proxy configuration",
|
|
58
|
+
"description": "Proxy settings",
|
|
59
|
+
"default": { "useApifyProxy": true },
|
|
60
|
+
"properties": {
|
|
61
|
+
"useApifyProxy": { "type": "boolean", "default": true }
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
"debugMode": {
|
|
65
|
+
"position": 60,
|
|
66
|
+
"type": "boolean",
|
|
67
|
+
"title": "Debug mode",
|
|
68
|
+
"description": "Enable debug logging",
|
|
69
|
+
"default": false
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"required": ["startUrls", "searchTerm"]
|
|
73
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"actorSpecification": 1,
|
|
3
|
+
"name": "web-scraper-merged",
|
|
4
|
+
"title": "Web Scraper with merged schemas",
|
|
5
|
+
"description": "A web scraper with merged schemas.",
|
|
6
|
+
"version": "0.0",
|
|
7
|
+
"meta": {
|
|
8
|
+
"templateId": "ts-crawlee-playwright-chrome"
|
|
9
|
+
},
|
|
10
|
+
"input": "./input_schema.json",
|
|
11
|
+
"storages": {
|
|
12
|
+
"dataset": "./dataset_schema.json"
|
|
13
|
+
},
|
|
14
|
+
"dockerfile": "./Dockerfile"
|
|
15
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"actorSpecification": 1,
|
|
3
|
+
"fields": {
|
|
4
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
5
|
+
"title": "Dataset schema for Web Scraper",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"title": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"title": "Title",
|
|
11
|
+
"description": "Page title"
|
|
12
|
+
},
|
|
13
|
+
"url": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"title": "URL",
|
|
16
|
+
"description": "Page URL"
|
|
17
|
+
},
|
|
18
|
+
"text": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"title": "Text content",
|
|
21
|
+
"description": "Extracted text"
|
|
22
|
+
},
|
|
23
|
+
"timestamp": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"title": "Timestamp",
|
|
26
|
+
"description": "When the data was scraped"
|
|
27
|
+
},
|
|
28
|
+
"type": {
|
|
29
|
+
"type": "string",
|
|
30
|
+
"title": "Type",
|
|
31
|
+
"description": "Item type"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"required": ["title", "url", "type"]
|
|
35
|
+
},
|
|
36
|
+
"views": {}
|
|
37
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper merged",
|
|
3
|
+
"description": "startUrls and searchTerm are required. searchTerm must be one of the predefined values.",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"schemaVersion": 1,
|
|
6
|
+
"properties": {
|
|
7
|
+
"startUrls": {
|
|
8
|
+
"type": "array",
|
|
9
|
+
"title": "Start URLs",
|
|
10
|
+
"description": "List of URLs to scrape",
|
|
11
|
+
"default": [],
|
|
12
|
+
"editor": "requestListSources"
|
|
13
|
+
},
|
|
14
|
+
"searchTerm": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"title": "Search term",
|
|
17
|
+
"description": "Term to search for",
|
|
18
|
+
"enum": ["example", "test", "sample"],
|
|
19
|
+
"enumTitles": ["Example", "Test", "Sample"]
|
|
20
|
+
},
|
|
21
|
+
"categories": {
|
|
22
|
+
"type": "array"
|
|
23
|
+
},
|
|
24
|
+
"maxPages": {
|
|
25
|
+
"type": "integer",
|
|
26
|
+
"title": "Maximum pages",
|
|
27
|
+
"description": "Maximum number of pages to scrape",
|
|
28
|
+
"default": 10,
|
|
29
|
+
"sectionCaption": "Scraping options",
|
|
30
|
+
"sectionDescription": "Configure how many pages to scrape and other options.",
|
|
31
|
+
"maximum": 1000,
|
|
32
|
+
"minimum": 1
|
|
33
|
+
},
|
|
34
|
+
"proxy": {
|
|
35
|
+
"type": "object",
|
|
36
|
+
"title": "Proxy configuration",
|
|
37
|
+
"description": "Proxy settings",
|
|
38
|
+
"default": {
|
|
39
|
+
"useApifyProxy": true
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"maxRetries": {
|
|
43
|
+
"type": "integer",
|
|
44
|
+
"title": "Maximum retries",
|
|
45
|
+
"description": "Maximum number of retries",
|
|
46
|
+
"default": 3,
|
|
47
|
+
"maximum": 20,
|
|
48
|
+
"minimum": 1
|
|
49
|
+
},
|
|
50
|
+
"debugMode": {
|
|
51
|
+
"type": "boolean",
|
|
52
|
+
"title": "Debug mode",
|
|
53
|
+
"description": "Enable debug logging",
|
|
54
|
+
"default": false
|
|
55
|
+
}
|
|
56
|
+
},
|
|
57
|
+
"required": ["startUrls", "searchTerm"]
|
|
58
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper merged",
|
|
3
|
+
"description": "startUrls and searchTerm are required. searchTerm must be one of the predefined values.",
|
|
4
|
+
"properties": {
|
|
5
|
+
"searchTerm": {
|
|
6
|
+
"position": 20,
|
|
7
|
+
"type": "string",
|
|
8
|
+
"title": "Search term",
|
|
9
|
+
"description": "Term to search for",
|
|
10
|
+
"enum": ["example", "test", "sample"],
|
|
11
|
+
"enumTitles": ["Example", "Test", "Sample"]
|
|
12
|
+
},
|
|
13
|
+
"categories": {
|
|
14
|
+
"position": 30,
|
|
15
|
+
"type": "array",
|
|
16
|
+
"items": {
|
|
17
|
+
"type": "object",
|
|
18
|
+
"properties": {
|
|
19
|
+
"url": { "type": "string", "title": "Category URL" }
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"maxRetries": {
|
|
24
|
+
"position": 45,
|
|
25
|
+
"type": "integer",
|
|
26
|
+
"title": "Maximum retries",
|
|
27
|
+
"description": "Maximum number of retries",
|
|
28
|
+
"default": 3,
|
|
29
|
+
"minimum": 1,
|
|
30
|
+
"maximum": 20
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* This file was automatically generated by apify-schema-tools.
|
|
3
|
+
* DO NOT MODIFY IT BY HAND. Instead, modify the source JSONSchema file,
|
|
4
|
+
* and run apify-schema-tools' "sync" command to regenerate this file.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Shape of one dataset item, mirroring the dataset schema.
 * `title`, `url` and `type` are required; `text` and `timestamp` may be omitted.
 */
export interface DatasetItem {
    /** Page title */
    title: string;

    /** Page URL */
    url: string;

    /** Extracted text */
    text?: string;

    /** When the data was scraped */
    timestamp?: string;

    /** Item type */
    type: string;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* This file was automatically generated by apify-schema-tools.
|
|
3
|
+
* DO NOT MODIFY IT BY HAND. Instead, modify the source JSONSchema file,
|
|
4
|
+
* and run apify-schema-tools' "sync" command to regenerate this file.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { Actor } from "apify";
|
|
8
|
+
import type { Input } from "./input.js";
|
|
9
|
+
|
|
10
|
+
/**
 * Default values for the input properties that have one.
 * `getInputWithDefaultValues` spreads this object first and the caller's input
 * on top of it, so any key present in the input wins over the value here.
 */
export const DEFAULT_INPUT_VALUES = {
    startUrls: [],
    maxPages: 10,
    proxy: { useApifyProxy: true },
    maxRetries: 3,
    debugMode: false,
};
|
|
19
|
+
|
|
20
|
+
/**
 * Names of the required input fields that have no default value.
 * They are listed in the error thrown when no input object is supplied.
 */
export const REQUIRED_INPUT_FIELDS_WITHOUT_DEFAULT: string[] = ["searchTerm"];
|
|
21
|
+
|
|
22
|
+
/**
 * `Input` as seen after default values have been merged in: the properties
 * `maxPages`, `proxy`, `maxRetries` and `debugMode` are guaranteed to be present.
 * Only the top-level optionality is removed; `proxy.useApifyProxy` stays optional.
 */
export type InputWithDefaults = Input &
    Required<Pick<Input, "maxPages" | "proxy" | "maxRetries" | "debugMode">>;
|
|
42
|
+
|
|
43
|
+
/**
 * Returns the given input completed with `DEFAULT_INPUT_VALUES`.
 *
 * @param input - The Actor input; may be missing or `null`.
 * @returns The input with default values merged underneath it. When
 *   `Actor.isAtHome()` is true the input is returned untouched (cast only).
 * @throws Error when not at home and no input object was provided.
 */
export function getInputWithDefaultValues(input?: Input | null): InputWithDefaults {
    const runsOnPlatform = Actor.isAtHome();
    if (runsOnPlatform) {
        // The platform is supposed to fill in the default values itself,
        // so the input is passed through without merging or validation.
        return input as InputWithDefaults;
    }

    if (input) {
        // Shallow merge: keys present in the input replace the defaults wholesale.
        return { ...DEFAULT_INPUT_VALUES, ...input };
    }

    const requiredFields = REQUIRED_INPUT_FIELDS_WITHOUT_DEFAULT.join(", ");
    throw new Error(`Input is required, because the following fields are required: ${requiredFields}`);
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* This file was automatically generated by apify-schema-tools.
|
|
3
|
+
* DO NOT MODIFY IT BY HAND. Instead, modify the source JSONSchema file,
|
|
4
|
+
* and run apify-schema-tools' "sync" command to regenerate this file.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * startUrls and searchTerm are required. searchTerm must be one of the predefined values.
 */
export interface Input {
    /** List of URLs to scrape */
    startUrls: Array<{ url?: string }>;

    /** Term to search for */
    searchTerm: "example" | "test" | "sample";

    /** Optional list of category entries, each carrying an optional `url`. */
    categories?: Array<{ url?: string }>;

    /** Maximum number of pages to scrape */
    maxPages?: number;

    /** Proxy settings */
    proxy?: { useApifyProxy?: boolean };

    /** Maximum number of retries */
    maxRetries?: number;

    /** Enable debug logging */
    debugMode?: boolean;
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "Dataset schema for Web Scraper",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"properties": {
|
|
6
|
+
"title": {
|
|
7
|
+
"type": "string",
|
|
8
|
+
"title": "Title",
|
|
9
|
+
"description": "Page title"
|
|
10
|
+
},
|
|
11
|
+
"url": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"title": "URL",
|
|
14
|
+
"description": "Page URL"
|
|
15
|
+
},
|
|
16
|
+
"text": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"title": "Text content",
|
|
19
|
+
"description": "Extracted text"
|
|
20
|
+
},
|
|
21
|
+
"timestamp": {
|
|
22
|
+
"type": "string",
|
|
23
|
+
"title": "Timestamp",
|
|
24
|
+
"description": "When the data was scraped"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"required": ["title", "url"]
|
|
28
|
+
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper",
|
|
3
|
+
"description": "startUrls and searchTerm are required.",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"schemaVersion": 1,
|
|
6
|
+
"properties": {
|
|
7
|
+
"startUrls": {
|
|
8
|
+
"position": 10,
|
|
9
|
+
"type": "array",
|
|
10
|
+
"title": "Start URLs",
|
|
11
|
+
"description": "List of URLs to scrape",
|
|
12
|
+
"default": [],
|
|
13
|
+
"editor": "requestListSources",
|
|
14
|
+
"items": {
|
|
15
|
+
"type": "object",
|
|
16
|
+
"properties": {
|
|
17
|
+
"url": { "type": "string" }
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"searchTerm": {
|
|
22
|
+
"position": 20,
|
|
23
|
+
"type": "string",
|
|
24
|
+
"title": "Search term",
|
|
25
|
+
"description": "Term to search for",
|
|
26
|
+
"minLength": 1,
|
|
27
|
+
"maxLength": 100
|
|
28
|
+
},
|
|
29
|
+
"categories": {
|
|
30
|
+
"position": 30,
|
|
31
|
+
"type": "array",
|
|
32
|
+
"title": "Categories",
|
|
33
|
+
"description": "List of categories to filter results",
|
|
34
|
+
"default": [],
|
|
35
|
+
"items": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"name": { "type": "string", "title": "Category name" },
|
|
39
|
+
"id": { "type": "string", "title": "Category ID" }
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"maxPages": {
|
|
44
|
+
"position": 30,
|
|
45
|
+
"sectionCaption": "Scraping options",
|
|
46
|
+
"sectionDescription": "Configure how many pages to scrape and other options.",
|
|
47
|
+
"type": "integer",
|
|
48
|
+
"title": "Maximum pages",
|
|
49
|
+
"description": "Maximum number of pages to scrape",
|
|
50
|
+
"default": 10,
|
|
51
|
+
"minimum": 1,
|
|
52
|
+
"maximum": 1000
|
|
53
|
+
},
|
|
54
|
+
"proxy": {
|
|
55
|
+
"position": 40,
|
|
56
|
+
"type": "object",
|
|
57
|
+
"title": "Proxy configuration",
|
|
58
|
+
"description": "Proxy settings",
|
|
59
|
+
"default": { "useApifyProxy": true },
|
|
60
|
+
"properties": {
|
|
61
|
+
"useApifyProxy": { "type": "boolean", "default": true }
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
"debugMode": {
|
|
65
|
+
"position": 60,
|
|
66
|
+
"type": "boolean",
|
|
67
|
+
"title": "Debug mode",
|
|
68
|
+
"description": "Enable debug logging",
|
|
69
|
+
"default": false
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"required": ["startUrls", "searchTerm"]
|
|
73
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"actorSpecification": 1,
|
|
3
|
+
"name": "web-scraper",
|
|
4
|
+
"title": "Web Scraper",
|
|
5
|
+
"description": "A web scraper.",
|
|
6
|
+
"version": "0.0",
|
|
7
|
+
"meta": {
|
|
8
|
+
"templateId": "ts-crawlee-playwright-chrome"
|
|
9
|
+
},
|
|
10
|
+
"input": "./input_schema.json",
|
|
11
|
+
"storages": {
|
|
12
|
+
"dataset": "./dataset_schema.json"
|
|
13
|
+
},
|
|
14
|
+
"dockerfile": "./Dockerfile"
|
|
15
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"actorSpecification": 1,
|
|
3
|
+
"fields": {
|
|
4
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
5
|
+
"title": "Dataset schema for Web Scraper",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"title": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"title": "Title",
|
|
11
|
+
"description": "Page title"
|
|
12
|
+
},
|
|
13
|
+
"url": {
|
|
14
|
+
"type": "string",
|
|
15
|
+
"title": "URL",
|
|
16
|
+
"description": "Page URL"
|
|
17
|
+
},
|
|
18
|
+
"text": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"title": "Text content",
|
|
21
|
+
"description": "Extracted text"
|
|
22
|
+
},
|
|
23
|
+
"timestamp": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"title": "Timestamp",
|
|
26
|
+
"description": "When the data was scraped"
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"required": ["title", "url"]
|
|
30
|
+
},
|
|
31
|
+
"views": {}
|
|
32
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper",
|
|
3
|
+
"description": "startUrls and searchTerm are required.",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"schemaVersion": 1,
|
|
6
|
+
"properties": {
|
|
7
|
+
"startUrls": {
|
|
8
|
+
"type": "array",
|
|
9
|
+
"title": "Start URLs",
|
|
10
|
+
"description": "List of URLs to scrape",
|
|
11
|
+
"default": [],
|
|
12
|
+
"editor": "requestListSources"
|
|
13
|
+
},
|
|
14
|
+
"searchTerm": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"title": "Search term",
|
|
17
|
+
"description": "Term to search for",
|
|
18
|
+
"minLength": 1,
|
|
19
|
+
"maxLength": 100
|
|
20
|
+
},
|
|
21
|
+
"categories": {
|
|
22
|
+
"type": "array",
|
|
23
|
+
"title": "Categories",
|
|
24
|
+
"description": "List of categories to filter results",
|
|
25
|
+
"default": []
|
|
26
|
+
},
|
|
27
|
+
"maxPages": {
|
|
28
|
+
"type": "integer",
|
|
29
|
+
"title": "Maximum pages",
|
|
30
|
+
"description": "Maximum number of pages to scrape",
|
|
31
|
+
"default": 10,
|
|
32
|
+
"sectionCaption": "Scraping options",
|
|
33
|
+
"sectionDescription": "Configure how many pages to scrape and other options.",
|
|
34
|
+
"maximum": 1000,
|
|
35
|
+
"minimum": 1
|
|
36
|
+
},
|
|
37
|
+
"proxy": {
|
|
38
|
+
"type": "object",
|
|
39
|
+
"title": "Proxy configuration",
|
|
40
|
+
"description": "Proxy settings",
|
|
41
|
+
"default": {
|
|
42
|
+
"useApifyProxy": true
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
"debugMode": {
|
|
46
|
+
"type": "boolean",
|
|
47
|
+
"title": "Debug mode",
|
|
48
|
+
"description": "Enable debug logging",
|
|
49
|
+
"default": false
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
"required": ["startUrls", "searchTerm"]
|
|
53
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"title": "Dataset schema for Web Scraper",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"properties": {
|
|
6
|
+
"title": {
|
|
7
|
+
"type": "string",
|
|
8
|
+
"title": "Title",
|
|
9
|
+
"description": "Page title"
|
|
10
|
+
},
|
|
11
|
+
"url": {
|
|
12
|
+
"type": "string",
|
|
13
|
+
"title": "URL",
|
|
14
|
+
"description": "Page URL"
|
|
15
|
+
},
|
|
16
|
+
"text": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"title": "Text content",
|
|
19
|
+
"description": "Extracted text"
|
|
20
|
+
},
|
|
21
|
+
"timestamp": {
|
|
22
|
+
"type": "string",
|
|
23
|
+
"title": "Timestamp",
|
|
24
|
+
"description": "When the data was scraped"
|
|
25
|
+
}
|
|
26
|
+
},
|
|
27
|
+
"required": ["title", "url"]
|
|
28
|
+
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"title": "Input schema for Web Scraper",
|
|
3
|
+
"description": "startUrls and searchTerm are required.",
|
|
4
|
+
"type": "object",
|
|
5
|
+
"schemaVersion": 1,
|
|
6
|
+
"properties": {
|
|
7
|
+
"startUrls": {
|
|
8
|
+
"position": 10,
|
|
9
|
+
"type": "array",
|
|
10
|
+
"title": "Start URLs",
|
|
11
|
+
"description": "List of URLs to scrape",
|
|
12
|
+
"default": [],
|
|
13
|
+
"editor": "requestListSources",
|
|
14
|
+
"items": {
|
|
15
|
+
"type": "object",
|
|
16
|
+
"properties": {
|
|
17
|
+
"url": { "type": "string" }
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"searchTerm": {
|
|
22
|
+
"position": 20,
|
|
23
|
+
"type": "string",
|
|
24
|
+
"title": "Search term",
|
|
25
|
+
"description": "Term to search for",
|
|
26
|
+
"minLength": 1,
|
|
27
|
+
"maxLength": 100
|
|
28
|
+
},
|
|
29
|
+
"categories": {
|
|
30
|
+
"position": 30,
|
|
31
|
+
"type": "array",
|
|
32
|
+
"title": "Categories",
|
|
33
|
+
"description": "List of categories to filter results",
|
|
34
|
+
"default": [],
|
|
35
|
+
"items": {
|
|
36
|
+
"type": "object",
|
|
37
|
+
"properties": {
|
|
38
|
+
"name": { "type": "string", "title": "Category name" },
|
|
39
|
+
"id": { "type": "string", "title": "Category ID" }
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"maxPages": {
|
|
44
|
+
"position": 30,
|
|
45
|
+
"sectionCaption": "Scraping options",
|
|
46
|
+
"sectionDescription": "Configure how many pages to scrape and other options.",
|
|
47
|
+
"type": "integer",
|
|
48
|
+
"title": "Maximum pages",
|
|
49
|
+
"description": "Maximum number of pages to scrape",
|
|
50
|
+
"default": 10,
|
|
51
|
+
"minimum": 1,
|
|
52
|
+
"maximum": 1000
|
|
53
|
+
},
|
|
54
|
+
"proxy": {
|
|
55
|
+
"position": 40,
|
|
56
|
+
"type": "object",
|
|
57
|
+
"title": "Proxy configuration",
|
|
58
|
+
"description": "Proxy settings",
|
|
59
|
+
"default": { "useApifyProxy": true },
|
|
60
|
+
"properties": {
|
|
61
|
+
"useApifyProxy": { "type": "boolean", "default": true }
|
|
62
|
+
}
|
|
63
|
+
},
|
|
64
|
+
"debugMode": {
|
|
65
|
+
"position": 60,
|
|
66
|
+
"type": "boolean",
|
|
67
|
+
"title": "Debug mode",
|
|
68
|
+
"description": "Enable debug logging",
|
|
69
|
+
"default": false
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"required": ["startUrls", "searchTerm"]
|
|
73
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "test-package",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"apify-schema-tools": {
|
|
5
|
+
"input": ["input", "dataset"],
|
|
6
|
+
"output": ["json-schemas", "ts-types"],
|
|
7
|
+
"srcInput": "custom-src-schemas/input.json",
|
|
8
|
+
"srcDataset": "custom-src-schemas/dataset-item.json",
|
|
9
|
+
"outputTSDir": "src/custom-generated"
|
|
10
|
+
}
|
|
11
|
+
}
|