postal-code-scraper 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -61
- package/dist/index.cjs +392 -0
- package/dist/index.d.cts +43 -0
- package/dist/index.d.ts +43 -2
- package/dist/index.js +355 -20
- package/package.json +17 -6
- package/.mocharc.json +0 -4
- package/build/test/src/index.js +0 -26
- package/build/test/src/index.js.map +0 -1
- package/build/test/src/scraper/fetchers.js +0 -49
- package/build/test/src/scraper/fetchers.js.map +0 -1
- package/build/test/src/scraper/parsers.js +0 -63
- package/build/test/src/scraper/parsers.js.map +0 -1
- package/build/test/src/scraper/queue.js +0 -69
- package/build/test/src/scraper/queue.js.map +0 -1
- package/build/test/src/scraper/scrapers.js +0 -148
- package/build/test/src/scraper/scrapers.js.map +0 -1
- package/build/test/src/types.js +0 -3
- package/build/test/src/types.js.map +0 -1
- package/build/test/src/utils/id-generator.js +0 -33
- package/build/test/src/utils/id-generator.js.map +0 -1
- package/build/test/src/utils/logger.js +0 -87
- package/build/test/src/utils/logger.js.map +0 -1
- package/build/test/tests/postal-code-scraper.test.js +0 -14
- package/build/test/tests/postal-code-scraper.test.js.map +0 -1
- package/dist/scraper/fetchers.d.ts +0 -9
- package/dist/scraper/fetchers.js +0 -48
- package/dist/scraper/parsers.d.ts +0 -7
- package/dist/scraper/parsers.js +0 -62
- package/dist/scraper/queue.d.ts +0 -12
- package/dist/scraper/queue.js +0 -67
- package/dist/scraper/scrapers.d.ts +0 -18
- package/dist/scraper/scrapers.js +0 -149
- package/dist/types.d.ts +0 -32
- package/dist/types.js +0 -2
- package/dist/utils/env-config.d.ts +0 -1
- package/dist/utils/env-config.js +0 -7
- package/dist/utils/id-generator.d.ts +0 -4
- package/dist/utils/id-generator.js +0 -26
- package/dist/utils/logger.d.ts +0 -33
- package/dist/utils/logger.js +0 -86
- package/dist/utils/string-utils.d.ts +0 -1
- package/dist/utils/string-utils.js +0 -13
- package/src/index.ts +0 -2
- package/src/scraper/fetchers.ts +0 -30
- package/src/scraper/parsers.ts +0 -67
- package/src/scraper/queue.ts +0 -55
- package/src/scraper/scrapers.ts +0 -143
- package/src/types.ts +0 -37
- package/src/utils/env-config.ts +0 -3
- package/src/utils/id-generator.ts +0 -35
- package/src/utils/logger.ts +0 -105
- package/src/utils/string-utils.ts +0 -9
- package/tests/postal-code-scraper.test.ts +0 -100
- package/tests/tsconfig.json +0 -13
- package/tsconfig.json +0 -15
package/README.md
CHANGED
|
@@ -8,11 +8,11 @@ This library uses **Puppeteer** for web scraping, **Cheerio** for HTML parsing,
|
|
|
8
8
|
|
|
9
9
|
## 🚀 Features
|
|
10
10
|
|
|
11
|
-
- Scrape **postal codes**
|
|
12
|
-
- Scrape **all countries** in one go
|
|
11
|
+
- Scrape **postal codes** for one country or all countries
|
|
13
12
|
- Save results as **JSON** files for easy integration
|
|
14
|
-
-
|
|
15
|
-
-
|
|
13
|
+
- **Region-structured** output (country → region1 → region2 → region3 → ... → postal codes)
|
|
14
|
+
- **Postal-code lookup** output (postal code → region path)
|
|
15
|
+
- Configurable options (concurrency, retries, headless mode, output directory, logging, etc.)
|
|
16
16
|
- **Fully asynchronous** for optimized performance
|
|
17
17
|
|
|
18
18
|
## 📦 Installation
|
|
@@ -45,50 +45,65 @@ import { PostalCodeScraper } from "postal-code-scraper";
|
|
|
45
45
|
const { PostalCodeScraper } = require("postal-code-scraper");
|
|
46
46
|
```
|
|
47
47
|
|
|
48
|
-
### 2️⃣ **
|
|
48
|
+
### 2️⃣ **Instantiate Scraper**
|
|
49
49
|
|
|
50
50
|
```javascript
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
51
|
+
const scraper = new PostalCodeScraper();
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 3️⃣ **Scrape a Single Country**
|
|
55
|
+
|
|
56
|
+
```javascript
|
|
57
|
+
import { PostalCodeScraper } from "postal-code-scraper";
|
|
54
58
|
|
|
55
|
-
|
|
59
|
+
const scraper = new PostalCodeScraper();
|
|
60
|
+
|
|
61
|
+
await scraper.scrapeCountry("Romania");
|
|
56
62
|
```
|
|
57
63
|
|
|
58
64
|
📌 **Output Files (saved in `src/data` by default):**
|
|
59
65
|
|
|
60
|
-
- `
|
|
61
|
-
- `
|
|
66
|
+
- `romania-postal-codes.json`
|
|
67
|
+
- `romania-lookup.json`
|
|
62
68
|
|
|
63
|
-
###
|
|
69
|
+
### 4️⃣ **Scrape All Countries**
|
|
64
70
|
|
|
65
71
|
```javascript
|
|
66
|
-
|
|
67
|
-
await PostalCodeScraper.scrapeCountries();
|
|
68
|
-
}
|
|
72
|
+
import { PostalCodeScraper } from "postal-code-scraper";
|
|
69
73
|
|
|
70
|
-
|
|
74
|
+
const scraper = new PostalCodeScraper();
|
|
75
|
+
|
|
76
|
+
await scraper.scrapeCountries();
|
|
71
77
|
```
|
|
72
78
|
|
|
73
79
|
📌 This will fetch postal codes for **every available country**.
|
|
74
80
|
|
|
75
|
-
###
|
|
81
|
+
### 5️⃣ **Customize Scraper Configuration**
|
|
82
|
+
|
|
83
|
+
#### 🛠 Configuration Options
|
|
84
|
+
|
|
85
|
+
| Option | Type | Default | Description |
|
|
86
|
+
| --------------- | --------------- | -------------------------------- | ------------------------------------------------------------ |
|
|
87
|
+
| `directory` | `string` | `src/data` | The directory to save data |
|
|
88
|
+
| `concurrency` | `number` | `15` | Maximum concurrent requests to process |
|
|
89
|
+
| `maxRetries` | `number` | `5` | Number of retries for failed requests |
|
|
90
|
+
| `headless` | `boolean` | `true` | Run Puppeteer in headless mode |
|
|
91
|
+
| `usePrettyName` | `boolean` | `false` | Use country pretty names instead of default names |
|
|
92
|
+
| `logger` | `object` \| `null` | `Logger` (custom implementation) | Handles event logging, can be set to null to disable logging |
|
|
76
93
|
|
|
77
94
|
```javascript
|
|
95
|
+
import { PostalCodeScraper } from "postal-code-scraper";
|
|
96
|
+
|
|
78
97
|
const customScraper = new PostalCodeScraper({
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
98
|
+
concurrency: 10, // Limit concurrent requests
|
|
99
|
+
maxRetries: 3, // Max retries per request
|
|
100
|
+
headless: false, // Run Puppeteer in visible mode
|
|
101
|
+
usePrettyName: true, // Store data using country pretty names
|
|
102
|
+
logger: console, // Enable console logging (set to null to disable)
|
|
103
|
+
directory: "src/data", // Output directory
|
|
85
104
|
});
|
|
86
105
|
|
|
87
|
-
|
|
88
|
-
await customScraper.scrapeCountry("Germany");
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
run();
|
|
106
|
+
await customScraper.scrapeCountry("Germany");
|
|
92
107
|
```
|
|
93
108
|
|
|
94
109
|
## 📁 Output Data Format
|
|
@@ -117,41 +132,21 @@ run();
|
|
|
117
132
|
|
|
118
133
|
```json
|
|
119
134
|
{
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
"tamasesti_2": [
|
|
133
|
-
"hunedoara",
|
|
134
|
-
"tamasesti"
|
|
135
|
-
],
|
|
136
|
-
"valea_4": [
|
|
137
|
-
"hunedoara",
|
|
138
|
-
"valea"
|
|
139
|
-
],
|
|
140
|
-
}
|
|
135
|
+
"postalCodeMap": {
|
|
136
|
+
"337563": "tamasesti_2",
|
|
137
|
+
"337564": "valea_4",
|
|
138
|
+
"400001": "cluj-napoca_1",
|
|
139
|
+
"400002": "cluj-napoca_1",
|
|
140
|
+
"400003": "cluj-napoca_1"
|
|
141
|
+
},
|
|
142
|
+
"regions": {
|
|
143
|
+
"cluj-napoca_1": ["cluj", "cluj-napoca"],
|
|
144
|
+
"tamasesti_2": ["hunedoara", "tamasesti"],
|
|
145
|
+
"valea_4": ["hunedoara", "valea"]
|
|
146
|
+
}
|
|
141
147
|
}
|
|
142
148
|
```
|
|
143
149
|
|
|
144
|
-
## 🛠 Configuration Options
|
|
145
|
-
|
|
146
|
-
| Option | Type | Default | Description |
|
|
147
|
-
| --------------- | ----------------------------- | -------------------------------- | ---------------------------------------------------------------------------------------------- |
|
|
148
|
-
| `directory` | `string` | `src/data` | The directory to save data |
|
|
149
|
-
| `concurrency` | `number` | `15` | Maximum concurrent requests to process |
|
|
150
|
-
| `maxRetries` | `number` | `5` | Number of retries for failed requests |
|
|
151
|
-
| `headless` | `boolean` | `true` | Run Puppeteer in headless mode |
|
|
152
|
-
| `usePrettyName` | `boolean` | `false` | Use country pretty names instead of default names |
|
|
153
|
-
| `logger` | `object` `null` | `Logger` (custom implementation) | Handles event logging, can be set to null to disable logging |
|
|
154
|
-
|
|
155
150
|
## ❓ FAQs
|
|
156
151
|
|
|
157
152
|
### **1. Where are the postal code files stored?**
|
|
@@ -191,4 +186,3 @@ Contributions are welcome! Feel free to submit a pull request or open an issue.
|
|
|
191
186
|
## 📜 License
|
|
192
187
|
|
|
193
188
|
MIT License © 2024
|
|
194
|
-
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
var index_exports = {};
|
|
32
|
+
__export(index_exports, {
|
|
33
|
+
PostalCodeScraper: () => PostalCodeScraper
|
|
34
|
+
});
|
|
35
|
+
module.exports = __toCommonJS(index_exports);
|
|
36
|
+
|
|
37
|
+
// src/scraper/scrapers.ts
|
|
38
|
+
var import_path = __toESM(require("path"), 1);
|
|
39
|
+
var import_puppeteer = __toESM(require("puppeteer"), 1);
|
|
40
|
+
|
|
41
|
+
// src/scraper/queue.ts
|
|
42
|
+
var import_cheerio = require("cheerio");
|
|
43
|
+
var import_p_limit = __toESM(require("p-limit"), 1);
|
|
44
|
+
|
|
45
|
+
// src/scraper/parsers.ts
|
|
46
|
+
var Parser = class {
|
|
47
|
+
static parseRegions($, config) {
|
|
48
|
+
return $("h2:contains('Regions')").next(".regions").find("a").map((_index, element) => {
|
|
49
|
+
const path2 = $(element).attr("href");
|
|
50
|
+
const prettyName = $(element).text().trim();
|
|
51
|
+
if (!path2 || !prettyName) return null;
|
|
52
|
+
return {
|
|
53
|
+
name: path2.split("/").filter(Boolean).pop(),
|
|
54
|
+
prettyName,
|
|
55
|
+
path: path2
|
|
56
|
+
};
|
|
57
|
+
}).get().filter(Boolean);
|
|
58
|
+
}
|
|
59
|
+
static parsePostalCodes($, config) {
|
|
60
|
+
const codes = {};
|
|
61
|
+
$(".codes .container").each((_i, element) => {
|
|
62
|
+
const place = $(element).find(".place").text().trim();
|
|
63
|
+
const codesList = $(element).find(".code span").map((_j, el) => $(el).text().trim()).get();
|
|
64
|
+
if (place) {
|
|
65
|
+
const key = config.usePrettyName ? place : place.toLowerCase().replace(/\s+/g, "-");
|
|
66
|
+
codes[key] = codesList;
|
|
67
|
+
}
|
|
68
|
+
});
|
|
69
|
+
return codes;
|
|
70
|
+
}
|
|
71
|
+
static parseCountries($, config) {
|
|
72
|
+
return $(".regions div a").map((_i, element) => {
|
|
73
|
+
const path2 = $(element).attr("href");
|
|
74
|
+
return path2 ? { name: path2.replace(/\//g, ""), prettyName: $(element).text().trim(), path: path2 } : null;
|
|
75
|
+
}).get().filter(Boolean);
|
|
76
|
+
}
|
|
77
|
+
static parseCountryByName($, config, name) {
|
|
78
|
+
const countryElement = $(`.regions div a`).filter((_, el) => $(el).attr("href")?.replace(/\//g, "") === name.toLowerCase().trim());
|
|
79
|
+
if (!countryElement.length) return null;
|
|
80
|
+
const path2 = countryElement.attr("href");
|
|
81
|
+
const prettyName = countryElement.text().trim();
|
|
82
|
+
return path2 && prettyName ? {
|
|
83
|
+
name: path2.replace(/\//g, ""),
|
|
84
|
+
prettyName,
|
|
85
|
+
path: path2
|
|
86
|
+
} : null;
|
|
87
|
+
}
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
// src/utils/env-config.ts
|
|
91
|
+
/** Returns the root URL of the site being scraped. */
var getBaseUrl = () => "https://worldpostalcode.com";
|
|
94
|
+
|
|
95
|
+
// src/scraper/queue.ts
|
|
96
|
+
var ProcessingQueue = class {
|
|
97
|
+
constructor(fetcher, config) {
|
|
98
|
+
this.fetcher = fetcher;
|
|
99
|
+
this.config = config;
|
|
100
|
+
this.limit = (0, import_p_limit.default)(config.concurrency || 15);
|
|
101
|
+
}
|
|
102
|
+
queue = [];
|
|
103
|
+
visitedUrls = /* @__PURE__ */ new Set();
|
|
104
|
+
limit;
|
|
105
|
+
async process(startRegion, data) {
|
|
106
|
+
this.queue.push({ region: startRegion, currData: data });
|
|
107
|
+
while (this.queue.length > 0) {
|
|
108
|
+
const tasks = this.queue.map((item) => this.limit(() => this.processItem(item)));
|
|
109
|
+
this.queue = [];
|
|
110
|
+
await Promise.all(tasks);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
async processItem(item) {
|
|
114
|
+
const url = `${getBaseUrl()}${item.region.path}`;
|
|
115
|
+
if (this.visitedUrls.has(url)) return;
|
|
116
|
+
this.visitedUrls.add(url);
|
|
117
|
+
this.config.logger?.info(`Fetching: ${url}`);
|
|
118
|
+
try {
|
|
119
|
+
const html = await this.fetcher.fetchWithRetry(url);
|
|
120
|
+
const $ = (0, import_cheerio.load)(html);
|
|
121
|
+
const regions = Parser.parseRegions($, this.config);
|
|
122
|
+
regions.forEach((region) => {
|
|
123
|
+
const key = this.config.usePrettyName ? region.prettyName : region.name;
|
|
124
|
+
item.currData[key] = {};
|
|
125
|
+
this.queue.push({
|
|
126
|
+
region,
|
|
127
|
+
currData: item.currData[key]
|
|
128
|
+
});
|
|
129
|
+
});
|
|
130
|
+
const codes = Parser.parsePostalCodes($, this.config);
|
|
131
|
+
Object.assign(item.currData, codes);
|
|
132
|
+
} catch (error) {
|
|
133
|
+
this.config.logger?.error(`Error processing ${url}:`, error);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
// src/scraper/fetchers.ts
|
|
139
|
+
var Fetcher = class {
|
|
140
|
+
constructor(browser, config) {
|
|
141
|
+
this.browser = browser;
|
|
142
|
+
this.config = config;
|
|
143
|
+
}
|
|
144
|
+
async fetchHtml(url) {
|
|
145
|
+
const page = await this.browser.newPage();
|
|
146
|
+
try {
|
|
147
|
+
page.setDefaultNavigationTimeout(6e4);
|
|
148
|
+
await page.goto(url, { waitUntil: "domcontentloaded" });
|
|
149
|
+
return await page.content();
|
|
150
|
+
} finally {
|
|
151
|
+
await page.close();
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
async fetchWithRetry(url, retries = this.config.maxRetries || 5) {
|
|
155
|
+
try {
|
|
156
|
+
return await this.fetchHtml(url);
|
|
157
|
+
} catch (error) {
|
|
158
|
+
this.config.logger?.warn(`Retrying (${this.config.maxRetries - retries + 1}) for: ${url}`);
|
|
159
|
+
if (retries > 0) {
|
|
160
|
+
await new Promise((resolve) => setTimeout(resolve, Math.random() * 7e3 + 5e3));
|
|
161
|
+
return this.fetchWithRetry(url, retries - 1);
|
|
162
|
+
}
|
|
163
|
+
throw new Error(`Failed to fetch: ${url} after ${this.config.maxRetries} attempts`);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
// src/utils/id-generator.ts
|
|
169
|
+
/**
 * Builds a generator that assigns a stable id to each distinct region path.
 *
 * The returned function takes an array of region names (outermost first),
 * normalises every name (trim, lower-case, strip combining accents, whitespace
 * runs become "_") and returns `<last name>_<n>`, where `n` counts how many
 * different paths ending in that name the generator has seen so far. Calling
 * it again with an equivalent path returns the id issued the first time.
 */
var createRegionIdGenerator = () => {
  const idsByPath = /* @__PURE__ */ new Map();
  const usesByLeaf = /* @__PURE__ */ new Map();
  const slugify = (label) => label.trim().toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "").replace(/\s+/g, "_");
  return (regions) => {
    const slugs = regions.map((label) => slugify(label));
    const pathKey = slugs.join("|");
    const known = idsByPath.get(pathKey);
    // Stored ids are always strings, so `undefined` can only mean "not seen".
    if (known !== void 0) return known;
    const leaf = slugs[slugs.length - 1];
    const ordinal = (usesByLeaf.get(leaf) || 0) + 1;
    usesByLeaf.set(leaf, ordinal);
    const id = `${leaf}_${ordinal}`;
    idsByPath.set(pathKey, id);
    return id;
  };
};
|
|
188
|
+
|
|
189
|
+
// src/scraper/scrapers.ts
|
|
190
|
+
var import_fs = require("fs");
|
|
191
|
+
var import_cheerio2 = require("cheerio");
|
|
192
|
+
|
|
193
|
+
// src/utils/string-utils.ts
|
|
194
|
+
/**
 * Turns arbitrary text into a lower-case, ASCII-only token: trims it, strips
 * combining accents, replaces whitespace runs with "-" and finally removes
 * every character other than a-z, 0-9, "." and "-".
 */
var normalizeString = (str) => {
  const lowered = str.trim().toLowerCase();
  const withoutAccents = lowered.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  const hyphenated = withoutAccents.replace(/\s+/g, "-");
  return hyphenated.replace(/[^a-z0-9.-]/g, "");
};
|
|
197
|
+
|
|
198
|
+
// src/utils/logger.ts
|
|
199
|
+
var Logger = class _Logger {
|
|
200
|
+
static logLevel = "info";
|
|
201
|
+
static useColors = true;
|
|
202
|
+
static prefix = "[POSTAL-CODE-SCRAPER]";
|
|
203
|
+
static instance;
|
|
204
|
+
static configure(config) {
|
|
205
|
+
if (config.level) this.logLevel = config.level;
|
|
206
|
+
if (config.colors !== void 0) this.useColors = config.colors;
|
|
207
|
+
if (config.prefix) this.prefix = config.prefix;
|
|
208
|
+
if (config.logger) this.instance = config.logger;
|
|
209
|
+
}
|
|
210
|
+
static getInstance() {
|
|
211
|
+
return this.instance || new _Logger();
|
|
212
|
+
}
|
|
213
|
+
static debug(message, ...args) {
|
|
214
|
+
this.log("debug", message, args);
|
|
215
|
+
}
|
|
216
|
+
static info(message, ...args) {
|
|
217
|
+
this.log("info", message, args);
|
|
218
|
+
}
|
|
219
|
+
static warn(message, ...args) {
|
|
220
|
+
this.log("warn", message, args);
|
|
221
|
+
}
|
|
222
|
+
static error(message, ...args) {
|
|
223
|
+
this.log("error", message, args);
|
|
224
|
+
}
|
|
225
|
+
static shouldLog(level) {
|
|
226
|
+
if (this.logLevel === "silent") return false;
|
|
227
|
+
const levels = ["error", "warn", "info", "debug"];
|
|
228
|
+
return levels.indexOf(level) <= levels.indexOf(this.logLevel);
|
|
229
|
+
}
|
|
230
|
+
static log(level, message, args) {
|
|
231
|
+
if (!this.shouldLog(level)) return;
|
|
232
|
+
const logger = this.getInstance();
|
|
233
|
+
const formatted = this.formatMessage(level, message);
|
|
234
|
+
logger[level](formatted, ...args);
|
|
235
|
+
}
|
|
236
|
+
static formatMessage(level, message) {
|
|
237
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString();
|
|
238
|
+
const levelColor = this.getLevelColor(level);
|
|
239
|
+
const messageColor = this.useColors ? "\x1B[37m" : "";
|
|
240
|
+
return [
|
|
241
|
+
this.useColors ? "\x1B[90m" : "",
|
|
242
|
+
`${this.prefix} `,
|
|
243
|
+
`${timestamp} `,
|
|
244
|
+
levelColor,
|
|
245
|
+
`[${level.toUpperCase()}]`,
|
|
246
|
+
this.useColors ? "\x1B[0m" : "",
|
|
247
|
+
messageColor,
|
|
248
|
+
` ${message}`,
|
|
249
|
+
this.useColors ? "\x1B[0m" : ""
|
|
250
|
+
].join("");
|
|
251
|
+
}
|
|
252
|
+
static getLevelColor(level) {
|
|
253
|
+
if (!this.useColors) return "";
|
|
254
|
+
return {
|
|
255
|
+
error: "\x1B[31m",
|
|
256
|
+
// Red
|
|
257
|
+
warn: "\x1B[33m",
|
|
258
|
+
// Yellow
|
|
259
|
+
info: "\x1B[36m",
|
|
260
|
+
// Cyan
|
|
261
|
+
debug: "\x1B[35m"
|
|
262
|
+
// Magenta
|
|
263
|
+
}[level];
|
|
264
|
+
}
|
|
265
|
+
// Instance methods to implement LoggerInterface
|
|
266
|
+
debug(message, ...args) {
|
|
267
|
+
console.debug(message, ...args);
|
|
268
|
+
}
|
|
269
|
+
info(message, ...args) {
|
|
270
|
+
console.log(message, ...args);
|
|
271
|
+
}
|
|
272
|
+
warn(message, ...args) {
|
|
273
|
+
console.warn(message, ...args);
|
|
274
|
+
}
|
|
275
|
+
error(message, ...args) {
|
|
276
|
+
console.error(message, ...args);
|
|
277
|
+
}
|
|
278
|
+
};
|
|
279
|
+
|
|
280
|
+
// src/scraper/scrapers.ts
|
|
281
|
+
var PostalCodeScraper = class {
|
|
282
|
+
constructor(config = {}) {
|
|
283
|
+
this.config = config;
|
|
284
|
+
this.config = {
|
|
285
|
+
concurrency: 15,
|
|
286
|
+
maxRetries: 5,
|
|
287
|
+
headless: true,
|
|
288
|
+
directory: "src/data",
|
|
289
|
+
logger: Logger,
|
|
290
|
+
usePrettyName: false,
|
|
291
|
+
...config
|
|
292
|
+
};
|
|
293
|
+
}
|
|
294
|
+
browser;
|
|
295
|
+
queue;
|
|
296
|
+
fetcher;
|
|
297
|
+
async scrapeCountry(countryName) {
|
|
298
|
+
await this.initBrowser();
|
|
299
|
+
try {
|
|
300
|
+
const country = await this.getCountryDetails(countryName);
|
|
301
|
+
if (!country) {
|
|
302
|
+
this.config.logger?.warn(`Country not found: ${countryName}`);
|
|
303
|
+
return null;
|
|
304
|
+
}
|
|
305
|
+
const data = {};
|
|
306
|
+
await this.queue.process(country, data);
|
|
307
|
+
this.saveData(data, `${country.name}-postal-codes.json`, this.config.directory);
|
|
308
|
+
const postalCodeLookup = this.generatePostalCodeLookup(data);
|
|
309
|
+
this.saveData(postalCodeLookup, `${country.name}-lookup.json`, this.config.directory);
|
|
310
|
+
} finally {
|
|
311
|
+
await this.cleanup();
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
async scrapeCountries() {
|
|
315
|
+
await this.initBrowser();
|
|
316
|
+
try {
|
|
317
|
+
const countries = await this.getCountriesDetails();
|
|
318
|
+
if (countries.length === 0) {
|
|
319
|
+
this.config.logger?.warn("No countries found.");
|
|
320
|
+
return null;
|
|
321
|
+
}
|
|
322
|
+
for (const country of countries) {
|
|
323
|
+
const key = this.config.usePrettyName ? country.prettyName : country.name;
|
|
324
|
+
const countryData = {};
|
|
325
|
+
this.config.logger?.info(`Processing country: ${key}`);
|
|
326
|
+
await this.queue.process(country, countryData);
|
|
327
|
+
this.saveData(countryData, `${key}-postal-codes.json`, this.config.directory);
|
|
328
|
+
const postalCodeLookup = this.generatePostalCodeLookup(countryData);
|
|
329
|
+
this.saveData(postalCodeLookup, `${key}-lookup.json`, this.config.directory);
|
|
330
|
+
}
|
|
331
|
+
} finally {
|
|
332
|
+
await this.cleanup();
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
async initBrowser() {
|
|
336
|
+
this.browser = await import_puppeteer.default.launch({ headless: this.config.headless });
|
|
337
|
+
this.fetcher = new Fetcher(this.browser, this.config);
|
|
338
|
+
this.queue = new ProcessingQueue(this.fetcher, this.config);
|
|
339
|
+
}
|
|
340
|
+
async getCountryDetails(name) {
|
|
341
|
+
try {
|
|
342
|
+
const html = await this.fetcher.fetchWithRetry(getBaseUrl());
|
|
343
|
+
return Parser.parseCountryByName((0, import_cheerio2.load)(html), this.config, name);
|
|
344
|
+
} catch (error) {
|
|
345
|
+
this.config.logger?.error(`Error fetching country details: ${name}`, error);
|
|
346
|
+
return null;
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
async getCountriesDetails() {
|
|
350
|
+
try {
|
|
351
|
+
const html = await this.fetcher.fetchWithRetry(getBaseUrl());
|
|
352
|
+
return Parser.parseCountries((0, import_cheerio2.load)(html), this.config);
|
|
353
|
+
} catch (error) {
|
|
354
|
+
this.config.logger?.error("Error fetching countries details", error);
|
|
355
|
+
return [];
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
generatePostalCodeLookup(data) {
|
|
359
|
+
return this.buildLookup(data, createRegionIdGenerator());
|
|
360
|
+
}
|
|
361
|
+
buildLookup(regionObj, idGenerator, acc = [], result = { postalCodeMap: {}, regions: {} }) {
|
|
362
|
+
if (Array.isArray(regionObj)) {
|
|
363
|
+
for (const item of regionObj) {
|
|
364
|
+
const id = idGenerator(acc);
|
|
365
|
+
result.postalCodeMap[item] = id;
|
|
366
|
+
result.regions[id] = [...acc];
|
|
367
|
+
}
|
|
368
|
+
} else if (typeof regionObj === "object" && regionObj !== null) {
|
|
369
|
+
for (const [regionKey, regionValue] of Object.entries(regionObj)) {
|
|
370
|
+
this.buildLookup(regionValue, idGenerator, [...acc, regionKey], result);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
return result;
|
|
374
|
+
}
|
|
375
|
+
saveData(data, fileName, directory = "src/data") {
|
|
376
|
+
try {
|
|
377
|
+
(0, import_fs.mkdirSync)(directory, { recursive: true });
|
|
378
|
+
const filePath = import_path.default.join(directory, normalizeString(fileName));
|
|
379
|
+
(0, import_fs.writeFileSync)(filePath, JSON.stringify(data, null, 2), { flag: "w" });
|
|
380
|
+
this.config.logger?.info(`Saved data to ${filePath}`);
|
|
381
|
+
} catch (error) {
|
|
382
|
+
this.config.logger?.error(`Error saving data to ${fileName}`, error);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
async cleanup() {
|
|
386
|
+
await this.browser?.close();
|
|
387
|
+
}
|
|
388
|
+
};
|
|
389
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
390
|
+
0 && (module.exports = {
|
|
391
|
+
PostalCodeScraper
|
|
392
|
+
});
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
type Region = {
|
|
2
|
+
path: string;
|
|
3
|
+
name: string;
|
|
4
|
+
prettyName: string;
|
|
5
|
+
};
|
|
6
|
+
type ScraperConfig = {
|
|
7
|
+
usePrettyName?: boolean;
|
|
8
|
+
directory?: string;
|
|
9
|
+
concurrency?: number;
|
|
10
|
+
maxRetries?: number;
|
|
11
|
+
headless?: boolean;
|
|
12
|
+
logger?: any;
|
|
13
|
+
};
|
|
14
|
+
interface LookupData {
|
|
15
|
+
postalCodeMap: {
|
|
16
|
+
[postalCode: string]: string;
|
|
17
|
+
};
|
|
18
|
+
regions: {
|
|
19
|
+
[code: string]: string[];
|
|
20
|
+
};
|
|
21
|
+
}
|
|
22
|
+
interface RegionData {
|
|
23
|
+
[key: string]: RegionData | string[];
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
declare class PostalCodeScraper {
|
|
27
|
+
private config;
|
|
28
|
+
private browser;
|
|
29
|
+
private queue;
|
|
30
|
+
private fetcher;
|
|
31
|
+
constructor(config?: ScraperConfig);
|
|
32
|
+
scrapeCountry(countryName: string): Promise<null | undefined>;
|
|
33
|
+
scrapeCountries(): Promise<null | undefined>;
|
|
34
|
+
private initBrowser;
|
|
35
|
+
private getCountryDetails;
|
|
36
|
+
private getCountriesDetails;
|
|
37
|
+
private generatePostalCodeLookup;
|
|
38
|
+
private buildLookup;
|
|
39
|
+
private saveData;
|
|
40
|
+
private cleanup;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
export { type LookupData, PostalCodeScraper, type Region, type RegionData, type ScraperConfig };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,2 +1,43 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
type Region = {
|
|
2
|
+
path: string;
|
|
3
|
+
name: string;
|
|
4
|
+
prettyName: string;
|
|
5
|
+
};
|
|
6
|
+
type ScraperConfig = {
|
|
7
|
+
usePrettyName?: boolean;
|
|
8
|
+
directory?: string;
|
|
9
|
+
concurrency?: number;
|
|
10
|
+
maxRetries?: number;
|
|
11
|
+
headless?: boolean;
|
|
12
|
+
logger?: any;
|
|
13
|
+
};
|
|
14
|
+
interface LookupData {
|
|
15
|
+
postalCodeMap: {
|
|
16
|
+
[postalCode: string]: string;
|
|
17
|
+
};
|
|
18
|
+
regions: {
|
|
19
|
+
[code: string]: string[];
|
|
20
|
+
};
|
|
21
|
+
}
|
|
22
|
+
interface RegionData {
|
|
23
|
+
[key: string]: RegionData | string[];
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
declare class PostalCodeScraper {
|
|
27
|
+
private config;
|
|
28
|
+
private browser;
|
|
29
|
+
private queue;
|
|
30
|
+
private fetcher;
|
|
31
|
+
constructor(config?: ScraperConfig);
|
|
32
|
+
scrapeCountry(countryName: string): Promise<null | undefined>;
|
|
33
|
+
scrapeCountries(): Promise<null | undefined>;
|
|
34
|
+
private initBrowser;
|
|
35
|
+
private getCountryDetails;
|
|
36
|
+
private getCountriesDetails;
|
|
37
|
+
private generatePostalCodeLookup;
|
|
38
|
+
private buildLookup;
|
|
39
|
+
private saveData;
|
|
40
|
+
private cleanup;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
export { type LookupData, PostalCodeScraper, type Region, type RegionData, type ScraperConfig };
|