postal-code-scraper 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mocharc.json +4 -0
- package/LICENSE +21 -0
- package/README.md +194 -0
- package/build/test/src/index.js +26 -0
- package/build/test/src/index.js.map +1 -0
- package/build/test/src/scraper/fetchers.js +49 -0
- package/build/test/src/scraper/fetchers.js.map +1 -0
- package/build/test/src/scraper/parsers.js +63 -0
- package/build/test/src/scraper/parsers.js.map +1 -0
- package/build/test/src/scraper/queue.js +69 -0
- package/build/test/src/scraper/queue.js.map +1 -0
- package/build/test/src/scraper/scrapers.js +148 -0
- package/build/test/src/scraper/scrapers.js.map +1 -0
- package/build/test/src/types.js +3 -0
- package/build/test/src/types.js.map +1 -0
- package/build/test/src/utils/id-generator.js +33 -0
- package/build/test/src/utils/id-generator.js.map +1 -0
- package/build/test/src/utils/logger.js +87 -0
- package/build/test/src/utils/logger.js.map +1 -0
- package/build/test/tests/postal-code-scraper.test.js +14 -0
- package/build/test/tests/postal-code-scraper.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +25 -0
- package/dist/scraper/fetchers.d.ts +9 -0
- package/dist/scraper/fetchers.js +48 -0
- package/dist/scraper/parsers.d.ts +7 -0
- package/dist/scraper/parsers.js +62 -0
- package/dist/scraper/queue.d.ts +12 -0
- package/dist/scraper/queue.js +67 -0
- package/dist/scraper/scrapers.d.ts +19 -0
- package/dist/scraper/scrapers.js +149 -0
- package/dist/types.d.ts +32 -0
- package/dist/types.js +2 -0
- package/dist/utils/env-config.d.ts +1 -0
- package/dist/utils/env-config.js +7 -0
- package/dist/utils/id-generator.d.ts +4 -0
- package/dist/utils/id-generator.js +26 -0
- package/dist/utils/logger.d.ts +33 -0
- package/dist/utils/logger.js +86 -0
- package/dist/utils/string-utils.d.ts +1 -0
- package/dist/utils/string-utils.js +13 -0
- package/package.json +61 -0
- package/src/index.ts +3 -0
- package/src/scraper/fetchers.ts +30 -0
- package/src/scraper/parsers.ts +67 -0
- package/src/scraper/queue.ts +55 -0
- package/src/scraper/scrapers.ts +143 -0
- package/src/types.ts +37 -0
- package/src/utils/env-config.ts +3 -0
- package/src/utils/id-generator.ts +35 -0
- package/src/utils/logger.ts +105 -0
- package/src/utils/string-utils.ts +9 -0
- package/tests/postal-code-scraper.test.ts +100 -0
- package/tests/tsconfig.json +13 -0
- package/tsconfig.json +15 -0
package/.mocharc.json
ADDED
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 sasiasas
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# Postal Code Scraper
|
|
2
|
+
|
|
3
|
+
## 📌 Overview
|
|
4
|
+
|
|
5
|
+
**Postal Code Scraper** is an automated web scraper designed to extract postal code data from countries worldwide. It efficiently fetches postal codes and organizes them into structured JSON files for easy use in applications.
|
|
6
|
+
|
|
7
|
+
This library uses **Puppeteer** for web scraping and **Cheerio** for HTML parsing, ensuring accurate and efficient data extraction.
|
|
8
|
+
|
|
9
|
+
## 🚀 Features
|
|
10
|
+
|
|
11
|
+
- Scrape **postal codes** from any country
|
|
12
|
+
- Scrape **all countries** in one go
|
|
13
|
+
- Save results as **JSON** files for easy integration
|
|
14
|
+
- Configurable settings (concurrency, retries, headless mode, etc.) — see **Configuration Options** below
|
|
15
|
+
- Structured **postal code lookup** generation
|
|
16
|
+
- **Fully asynchronous** for optimized performance
|
|
17
|
+
|
|
18
|
+
## 📦 Installation
|
|
19
|
+
|
|
20
|
+
Install via npm:
|
|
21
|
+
|
|
22
|
+
```sh
|
|
23
|
+
npm install postal-code-scraper
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Or with Yarn:
|
|
27
|
+
|
|
28
|
+
```sh
|
|
29
|
+
yarn add postal-code-scraper
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 📖 Usage Guide
|
|
33
|
+
|
|
34
|
+
### 1️⃣ **Import the Library**
|
|
35
|
+
|
|
36
|
+
#### ES Module (Recommended):
|
|
37
|
+
|
|
38
|
+
```javascript
|
|
39
|
+
import { PostalCodeScraper } from "postal-code-scraper";
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
#### CommonJS:
|
|
43
|
+
|
|
44
|
+
```javascript
|
|
45
|
+
const { PostalCodeScraper } = require("postal-code-scraper");
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### 2️⃣ **Scrape a Single Country**
|
|
49
|
+
|
|
50
|
+
```javascript
|
|
51
|
+
async function scrapeSingleCountry() {
|
|
52
|
+
await new PostalCodeScraper().scrapeCountry("Canada");
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
scrapeSingleCountry();
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
📌 **Output files** (saved in `src/data/` by default):
|
|
59
|
+
|
|
60
|
+
- `canada-postal-codes.json`
|
|
61
|
+
- `canada-lookup.json`
|
|
62
|
+
|
|
63
|
+
### 3️⃣ **Scrape All Countries**
|
|
64
|
+
|
|
65
|
+
```javascript
|
|
66
|
+
async function scrapeAllCountries() {
|
|
67
|
+
await new PostalCodeScraper().scrapeCountries();
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
scrapeAllCountries();
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
📌 This will fetch postal codes for **every available country**.
|
|
74
|
+
|
|
75
|
+
### 4️⃣ **Customize Scraper Configuration**
|
|
76
|
+
|
|
77
|
+
```javascript
|
|
78
|
+
const customScraper = new PostalCodeScraper({
|
|
79
|
+
concurrency: 10, // Limit concurrent requests
|
|
80
|
+
maxRetries: 3, // Max retries per failed request (so transient failures don't lose data)
|
|
81
|
+
headless: false, // Run Puppeteer in visible mode
|
|
82
|
+
usePrettyName: true, // Store data using country pretty names
|
|
83
|
+
logger: console, // Enable console logging (the default is the built-in Logger)
|
|
84
|
+
directory: 'src/data' // Choose the folder where you want to save the data
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
async function run() {
|
|
88
|
+
await customScraper.scrapeCountry("Germany");
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
run();
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## 📁 Output Data Format
|
|
95
|
+
|
|
96
|
+
### 🔹 `romania-postal-codes.json`
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{
|
|
100
|
+
"cluj": {
|
|
101
|
+
"agarbiciu": [
|
|
102
|
+
"407146"
|
|
103
|
+
],
|
|
104
|
+
"aghiresu": [
|
|
105
|
+
"407005"
|
|
106
|
+
],
|
|
107
|
+
"cluj-napoca": [
|
|
108
|
+
"400001",
|
|
109
|
+
"400002",
|
|
110
|
+
"400003",
|
|
111
|
+
"..."
|
|
112
|
+
]
|
|
113
|
+
}
}
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 🔹 `romania-lookup.json`
|
|
117
|
+
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"postalCodeMap": {
|
|
121
|
+
"337563": "tamasesti_2",
|
|
122
|
+
"337564": "valea_4",
|
|
123
|
+
"400001": "cluj-napoca_1",
|
|
124
|
+
"400002": "cluj-napoca_1",
|
|
125
|
+
"400003": "cluj-napoca_1"
|
|
126
|
+
},
|
|
127
|
+
"regions": {
|
|
128
|
+
"cluj-napoca_1": [
|
|
129
|
+
"cluj",
|
|
130
|
+
"cluj-napoca"
|
|
131
|
+
],
|
|
132
|
+
"tamasesti_2": [
|
|
133
|
+
"hunedoara",
|
|
134
|
+
"tamasesti"
|
|
135
|
+
],
|
|
136
|
+
"valea_4": [
|
|
137
|
+
"hunedoara",
|
|
138
|
+
"valea"
|
|
139
|
+
]
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## 🛠 Configuration Options
|
|
145
|
+
|
|
146
|
+
| Option | Type | Default | Description |
|
|
147
|
+
| --------------- | ----------------------------- | -------------------------------- | ---------------------------------------------------------------------------------------------- |
|
|
148
|
+
| `directory` | `string` | `src/data` | The directory to save data |
|
|
149
|
+
| `concurrency` | `number` | `15` | Maximum concurrent requests to process |
|
|
150
|
+
| `maxRetries` | `number` | `5` | Number of retries for failed requests |
|
|
151
|
+
| `headless` | `boolean` | `true` | Run Puppeteer in headless mode |
|
|
152
|
+
| `usePrettyName` | `boolean` | `false` | Use country pretty names instead of default names |
|
|
153
|
+
| `logger` | `object` or `null` | `Logger` (custom implementation) | Handles event logging; set it to `null` to disable logging |
|
|
154
|
+
|
|
155
|
+
## ❓ FAQs
|
|
156
|
+
|
|
157
|
+
### **1. Where are the postal code files stored?**
|
|
158
|
+
|
|
159
|
+
By default, they are saved in:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
src/data/
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Each country has two JSON files: one with raw postal codes and another with a structured lookup.
|
|
166
|
+
|
|
167
|
+
### **2. Can I scrape multiple countries at once?**
|
|
168
|
+
|
|
169
|
+
Yes, using `scrapeCountries()`, which scrapes **all countries** automatically.
|
|
170
|
+
|
|
171
|
+
### **3. Can I change the output directory?**
|
|
172
|
+
|
|
173
|
+
Yes, by changing the `directory` attribute in configuration.
|
|
174
|
+
|
|
175
|
+
### **4. Does this package work with TypeScript?**
|
|
176
|
+
|
|
177
|
+
Yes! The package includes TypeScript types for better development experience.
|
|
178
|
+
|
|
179
|
+
### **5. How can I turn off logging?**
|
|
180
|
+
|
|
181
|
+
Yes, by setting the `logger` option in the configuration to `null`.
|
|
182
|
+
|
|
183
|
+
## 🏗 Future Enhancements
|
|
184
|
+
|
|
185
|
+
- ✅ Support for exporting data as CSV
|
|
186
|
+
|
|
187
|
+
## 🤝 Contributing
|
|
188
|
+
|
|
189
|
+
Contributions are welcome! Feel free to submit a pull request or open an issue.
|
|
190
|
+
|
|
191
|
+
## 📜 License
|
|
192
|
+
|
|
193
|
+
MIT License © 2024
|
|
194
|
+
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
14
|
+
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
|
+
};
|
|
16
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
17
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
18
|
+
};
|
|
19
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
20
|
+
exports.default = exports.PostalCodeScraper = void 0;
|
|
21
|
+
__exportStar(require("./types"), exports);
|
|
22
|
+
var scrapers_1 = require("./scraper/scrapers");
|
|
23
|
+
Object.defineProperty(exports, "PostalCodeScraper", { enumerable: true, get: function () { return scrapers_1.PostalCodeScraper; } });
|
|
24
|
+
var scrapers_2 = require("./scraper/scrapers");
|
|
25
|
+
Object.defineProperty(exports, "default", { enumerable: true, get: function () { return __importDefault(scrapers_2).default; } });
|
|
26
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA,0CAAwB;AACxB,+CAAuD;AAA9C,6GAAA,iBAAiB,OAAA;AAC1B,+CAA6C;AAApC,oHAAA,OAAO,OAAA"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.Fetcher = void 0;
|
|
13
|
+
/**
 * Loads pages through a shared browser object and returns their HTML,
 * retrying failed loads after a randomized pause.
 */
class Fetcher {
    /**
     * @param {object} browser Browser-like object exposing `newPage()`.
     * @param {object} config  Scraper configuration; `maxRetries` and `logger` are read here.
     */
    constructor(browser, config) {
        this.browser = browser;
        this.config = config;
    }
    /**
     * Opens a new page, navigates to `url` (60 s navigation timeout, waiting for
     * `domcontentloaded`) and resolves with the page content.
     * The page is closed whether or not navigation succeeds.
     *
     * @param {string} url Address to load.
     * @returns {Promise<string>} The page HTML.
     */
    fetchHtml(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const page = yield this.browser.newPage();
            try {
                page.setDefaultNavigationTimeout(60000);
                yield page.goto(url, { waitUntil: "domcontentloaded" });
                return yield page.content();
            }
            finally {
                yield page.close();
            }
        });
    }
    /**
     * Fetches `url`, retrying up to `retries` more times when a load fails.
     * Between attempts it waits a random 5-12 seconds.
     *
     * @param {string} url Address to load.
     * @param {number} [retries] Extra attempts allowed after the first failure.
     *   Defaults to `config.maxRetries`, or 5 when that is null/undefined; an
     *   explicit 0 disables retrying.
     * @returns {Promise<string>} The page HTML.
     * @throws {Error} When every attempt failed; `cause` holds the last underlying error.
     */
    fetchWithRetry(url, retries = this.config.maxRetries != null ? this.config.maxRetries : 5) {
        return __awaiter(this, void 0, void 0, function* () {
            let remaining = retries;
            let attempt = 0;
            for (;;) {
                attempt += 1;
                try {
                    return yield this.fetchHtml(url);
                }
                catch (error) {
                    if (!(remaining > 0)) {
                        // Report the attempts actually made instead of echoing a possibly unset config value.
                        const failure = new Error(`Failed to fetch: ${url} after ${attempt} attempt${attempt === 1 ? "" : "s"}`);
                        failure.cause = error;
                        throw failure;
                    }
                    const logger = this.config.logger;
                    if (logger != null) {
                        // Only announce a retry when one is really going to happen.
                        logger.warn(`Retrying (${attempt}) for: ${url}`);
                    }
                    remaining -= 1;
                    yield new Promise((resolve) => setTimeout(resolve, Math.random() * 7000 + 5000));
                }
            }
        });
    }
}
|
|
48
|
+
exports.Fetcher = Fetcher;
|
|
49
|
+
//# sourceMappingURL=fetchers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetchers.js","sourceRoot":"","sources":["../../../../src/scraper/fetchers.ts"],"names":[],"mappings":";;;;;;;;;;;;AAGA,MAAa,OAAO;IACnB,YAAoB,OAAgB,EAAU,MAAqB;QAA/C,YAAO,GAAP,OAAO,CAAS;QAAU,WAAM,GAAN,MAAM,CAAe;IAAG,CAAC;IAEjE,SAAS,CAAC,GAAW;;YAC1B,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,IAAI,CAAC;gBACJ,IAAI,CAAC,2BAA2B,CAAC,KAAK,CAAC,CAAC;gBACxC,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;gBACxD,OAAO,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAC7B,CAAC;oBAAS,CAAC;gBACV,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACpB,CAAC;QACF,CAAC;KAAA;IAEK,cAAc;6DAAC,GAAW,EAAE,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,IAAI,CAAC;;YACtE,IAAI,CAAC;gBACJ,OAAO,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;YAClC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,aAAa,IAAI,CAAC,MAAM,CAAC,UAAW,GAAG,OAAO,GAAG,CAAC,UAAU,GAAG,EAAE,CAAC,CAAC;gBAC5F,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;oBACjB,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC;oBACjF,OAAO,IAAI,CAAC,cAAc,CAAC,GAAG,EAAE,OAAO,GAAG,CAAC,CAAC,CAAC;gBAC9C,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,oBAAoB,GAAG,UAAU,IAAI,CAAC,MAAM,CAAC,UAAU,WAAW,CAAC,CAAC;YACrF,CAAC;QACF,CAAC;KAAA;CACD;AA1BD,0BA0BC"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Parser = void 0;
|
|
4
|
+
/**
 * Stateless helpers that pull countries, regions and postal codes out of a
 * page that has been loaded into a cheerio-style `$` function.
 */
class Parser {
    /**
     * Lists the links inside the `.regions` element that directly follows an
     * `h2` containing the text "Regions".
     *
     * @param {Function} $ Cheerio-style query function for the page.
     * @param {object} config Scraper configuration (not read by this method).
     * @returns {Array<{name: string, prettyName: string, path: string}>}
     *   One entry per link that has both an href and a label; `name` is the
     *   last non-empty segment of the href.
     */
    static parseRegions($, config) {
        return $("h2:contains('Regions')")
            .next(".regions")
            .find("a")
            .map((_index, element) => {
                const path = $(element).attr("href");
                const prettyName = $(element).text().trim();
                if (!path || !prettyName)
                    return null;
                return {
                    name: path.split("/").filter(Boolean).pop(),
                    prettyName,
                    path,
                };
            })
            .get()
            .filter(Boolean);
    }
    /**
     * Maps every place found in `.codes .container` to its list of codes.
     *
     * @param {Function} $ Cheerio-style query function for the page.
     * @param {object} config When `usePrettyName` is truthy the place label is
     *   used as the key verbatim; otherwise it is lower-cased and whitespace
     *   runs become "-".
     * @returns {Object<string, string[]>} Codes keyed by place.
     */
    static parsePostalCodes($, config) {
        const codes = {};
        $(".codes .container").each((_i, element) => {
            const place = $(element).find(".place").text().trim();
            const codesList = $(element)
                .find(".code span")
                .map((_j, el) => $(el).text().trim())
                .get();
            if (place) {
                const key = config.usePrettyName ? place : place.toLowerCase().replace(/\s+/g, "-");
                codes[key] = codesList;
            }
        });
        return codes;
    }
    /**
     * Lists every country link (`.regions div a`) that has an href.
     *
     * @param {Function} $ Cheerio-style query function for the page.
     * @param {object} config Scraper configuration (not read by this method).
     * @returns {Array<{name: string, prettyName: string, path: string}>}
     *   `name` is the href with all "/" removed.
     */
    static parseCountries($, config) {
        return $(".regions div a")
            .map((_i, element) => {
                const path = $(element).attr("href");
                return path ? { name: path.replace(/\//g, ""), prettyName: $(element).text().trim(), path } : null;
            })
            .get()
            .filter(Boolean);
    }
    /**
     * Finds the country link whose href (with "/" removed) equals `name`.
     * The comparison uses the trimmed, lower-cased name, both as typed and with
     * whitespace runs turned into "-", so "United States" can match an href
     * such as "/united-states/".
     *
     * @param {Function} $ Cheerio-style query function for the page.
     * @param {object} config Scraper configuration (not read by this method).
     * @param {string} name Country name or slug to look for.
     * @returns {{name: string, prettyName: string, path: string} | null}
     *   The first matching country, or null when nothing matches or the match
     *   has no label.
     */
    static parseCountryByName($, config, name) {
        if (typeof name !== "string")
            return null;
        // Normalize once instead of once per candidate link.
        const wanted = name.toLowerCase().trim();
        if (!wanted)
            return null;
        const slug = wanted.replace(/\s+/g, "-");
        const countryElement = $(".regions div a")
            .filter((_i, el) => {
                const href = $(el).attr("href");
                if (!href)
                    return false;
                const id = href.replace(/\//g, "");
                return id === wanted || id === slug;
            })
            // Keep only the first hit so text() is not a concatenation of several labels.
            .first();
        if (!countryElement.length)
            return null;
        const path = countryElement.attr("href");
        const prettyName = countryElement.text().trim();
        return path && prettyName
            ? {
                name: path.replace(/\//g, ""),
                prettyName,
                path,
            }
            : null;
    }
}
|
|
62
|
+
exports.Parser = Parser;
|
|
63
|
+
//# sourceMappingURL=parsers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parsers.js","sourceRoot":"","sources":["../../../../src/scraper/parsers.ts"],"names":[],"mappings":";;;AAEA,MAAa,MAAM;IAClB,MAAM,CAAC,YAAY,CAAC,CAAe,EAAE,MAAqB;QACzD,OAAO,CAAC,CAAC,wBAAwB,CAAC;aAChC,IAAI,CAAC,UAAU,CAAC;aAChB,IAAI,CAAC,GAAG,CAAC;aACT,GAAG,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,EAAE;YACxB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrC,MAAM,UAAU,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YAC5C,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU;gBAAE,OAAO,IAAI,CAAC;YAEtC,OAAO;gBACN,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,GAAG,EAAG;gBAC5C,UAAU;gBACV,IAAI;aACJ,CAAC;QACH,CAAC,CAAC;aACD,GAAG,EAAE;aACL,MAAM,CAAC,OAAO,CAAa,CAAC;IAC/B,CAAC;IAED,MAAM,CAAC,gBAAgB,CAAC,CAAe,EAAE,MAAqB;QAC7D,MAAM,KAAK,GAA6B,EAAE,CAAC;QAE3C,CAAC,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;YAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;YACtD,MAAM,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC;iBAC1B,IAAI,CAAC,YAAY,CAAC;iBAClB,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;iBACpC,GAAG,EAAE,CAAC;YAER,IAAI,KAAK,EAAE,CAAC;gBACX,MAAM,GAAG,GAAG,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;gBACpF,KAAK,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC;YACxB,CAAC;QACF,CAAC,CAAC,CAAC;QAEH,OAAO,KAAK,CAAC;IACd,CAAC;IAED,MAAM,CAAC,cAAc,CAAC,CAAe,EAAE,MAAqB;QAC3D,OAAO,CAAC,CAAC,gBAAgB,CAAC;aACxB,GAAG,CAAC,CAAC,EAAE,EAAE,OAAO,EAAE,EAAE;YACpB,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YACrC,OAAO,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QACpG,CAAC,CAAC;aACD,GAAG,EAAE;aACL,MAAM,CAAC,OAAO,CAAa,CAAC;IAC/B,CAAC;IAED,MAAM,CAAC,kBAAkB,CAAC,CAAe,EAAE,MAAqB,EAAE,IAAY;QAC7E,MAAM,cAAc,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE
,EAAE,EAAE,EAAE,WAAC,OAAA,CAAA,MAAA,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,0CAAE,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,MAAK,IAAI,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAA,EAAA,CAAC,CAAC;QAEnI,IAAI,CAAC,cAAc,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;QAExC,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAChD,OAAO,IAAI,IAAI,UAAU;YACxB,CAAC,CAAC;gBACA,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;gBAC7B,UAAU;gBACV,IAAI;aACH;YACH,CAAC,CAAC,IAAI,CAAC;IACT,CAAC;CACD;AAhED,wBAgEC"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.ProcessingQueue = void 0;
|
|
16
|
+
const cheerio_1 = require("cheerio");
|
|
17
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
18
|
+
const parsers_1 = require("./parsers");
|
|
19
|
+
/**
 * Walks a region tree breadth-first: every processed page may add child
 * regions to the queue, and the postal codes found on a page are merged into
 * the data object that belongs to that region.
 */
class ProcessingQueue {
    /**
     * @param {object} fetcher Object exposing `fetchWithRetry(url)`.
     * @param {object} config  Scraper configuration; reads `concurrency`
     *   (falls back to 15 when falsy), `baseUrl`, `usePrettyName` and `logger`.
     */
    constructor(fetcher, config) {
        this.fetcher = fetcher;
        this.config = config;
        this.queue = [];
        this.visitedUrls = new Set();
        this.limit = (0, p_limit_1.default)(config.concurrency || 15);
    }
    /**
     * Processes `startRegion` and every region discovered below it, one level
     * at a time, filling `data` in place.
     *
     * @param {{name: string, prettyName: string, path: string}} startRegion Root region.
     * @param {object} data Object that receives the scraped tree.
     * @returns {Promise<void>}
     */
    process(startRegion, data) {
        return __awaiter(this, void 0, void 0, function* () {
            this.queue.push({ region: startRegion, currData: data });
            while (this.queue.length > 0) {
                // Snapshot the current level; processItem refills this.queue with the next one.
                const tasks = this.queue.map((item) => this.limit(() => this.processItem(item)));
                this.queue = [];
                yield Promise.all(tasks);
            }
        });
    }
    /**
     * Fetches one region page, queues its child regions and merges its postal
     * codes into `item.currData`. URLs already seen by this queue are skipped.
     * Failures are logged and do not reject.
     *
     * @param {{region: object, currData: object}} item Queue entry to handle.
     * @returns {Promise<void>}
     */
    processItem(item) {
        return __awaiter(this, void 0, void 0, function* () {
            const logger = this.config.logger;
            const url = `${this.config.baseUrl}${item.region.path}`;
            if (this.visitedUrls.has(url))
                return;
            this.visitedUrls.add(url);
            if (logger != null)
                logger.info(`Fetching: ${url}`);
            try {
                const html = yield this.fetcher.fetchWithRetry(url);
                const $ = (0, cheerio_1.load)(html);
                // Parse and add new regions to queue
                const regions = parsers_1.Parser.parseRegions($, this.config);
                regions.forEach((region) => {
                    const key = this.config.usePrettyName ? region.prettyName : region.name;
                    const existing = item.currData[key];
                    // A page can link to the same region twice. Reuse the container
                    // created for the first link: a fresh object would orphan the one
                    // already queued, and the duplicate is skipped as visited, so the
                    // scraped data would be lost.
                    const container = existing !== null && typeof existing === "object" && !Array.isArray(existing)
                        ? existing
                        : {};
                    item.currData[key] = container;
                    this.queue.push({
                        region,
                        currData: container,
                    });
                });
                // Parse postal codes
                const codes = parsers_1.Parser.parsePostalCodes($, this.config);
                Object.assign(item.currData, codes);
            }
            catch (error) {
                if (logger != null)
                    logger.error(`Error processing ${url}:`, error);
            }
        });
    }
}
|
|
68
|
+
exports.ProcessingQueue = ProcessingQueue;
|
|
69
|
+
//# sourceMappingURL=queue.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"queue.js","sourceRoot":"","sources":["../../../../src/scraper/queue.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;AAAA,qCAA+B;AAG/B,sDAA6B;AAC7B,uCAAmC;AAEnC,MAAa,eAAe;IAK3B,YAAoB,OAAgB,EAAU,MAAqB;QAA/C,YAAO,GAAP,OAAO,CAAS;QAAU,WAAM,GAAN,MAAM,CAAe;QAJ3D,UAAK,GAA0B,EAAE,CAAC;QAClC,gBAAW,GAAG,IAAI,GAAG,EAAU,CAAC;QAIvC,IAAI,CAAC,KAAK,GAAG,IAAA,iBAAM,EAAC,MAAM,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;IAC/C,CAAC;IAEK,OAAO,CAAC,WAAmB,EAAE,IAAS;;YAC3C,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;YAEzD,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;gBACjF,IAAI,CAAC,KAAK,GAAG,EAAE,CAAC;gBAChB,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAC1B,CAAC;QACF,CAAC;KAAA;IAEa,WAAW,CAAC,IAAyB;;;YAClD,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YAExD,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,OAAO;YACtC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAE1B,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC,CAAC;YAE7C,IAAI,CAAC;gBACJ,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC;gBACpD,MAAM,CAAC,GAAG,IAAA,cAAI,EAAC,IAAI,CAAC,CAAC;gBAErB,qCAAqC;gBACrC,MAAM,OAAO,GAAG,gBAAM,CAAC,YAAY,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;gBACpD,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;oBAC1B,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC;oBACxE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC;oBACxB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;wBACf,MAAM;wBACN,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;qBAC5B,CAAC,CAAC;gBACJ,CAAC,CAAC,CAAC;gBAEH,qBAAqB;gBACrB,MAAM,KAAK,GAAG,gBAAM,CAAC,gBAAgB,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;gBACtD,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;YACrC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,KAAK,CAAC,oBAAoB,GAAG,GA
AG,EAAE,KAAK,CAAC,CAAC;YAC9D,CAAC;QACF,CAAC;KAAA;CACD;AAjDD,0CAiDC"}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
12
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
13
|
+
};
|
|
14
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
15
|
+
exports.PostalCodeScraper = void 0;
|
|
16
|
+
const puppeteer_1 = __importDefault(require("puppeteer"));
|
|
17
|
+
const queue_1 = require("./queue");
|
|
18
|
+
const fetchers_1 = require("./fetchers");
|
|
19
|
+
const id_generator_1 = require("../utils/id-generator");
|
|
20
|
+
const fs_1 = require("fs");
|
|
21
|
+
const path_1 = __importDefault(require("path"));
|
|
22
|
+
const cheerio_1 = require("cheerio");
|
|
23
|
+
const parsers_1 = require("./parsers");
|
|
24
|
+
/**
 * Scrapes postal codes country by country and writes, for each country, a
 * `<name>-postal-codes.json` tree and a `<name>-lookup.json` index.
 */
class PostalCodeScraper {
    /**
     * @param {object} [config] Overrides for the defaults: `baseUrl`
     *   ("https://worldpostalcode.com"), `concurrency` (15), `maxRetries` (5),
     *   `headless` (true) and `directory` ("src/data"). `usePrettyName` and
     *   `logger` are read when present.
     */
    constructor(config = {}) {
        this.config = Object.assign({ baseUrl: "https://worldpostalcode.com", concurrency: 15, maxRetries: 5, headless: true, directory: "src/data" }, config);
    }
    /**
     * Scrapes one country and saves its two JSON files.
     * The browser is closed on every path, including "not found" and errors.
     *
     * @param {string} countryName Country to look up on the base page.
     * @returns {Promise<null|undefined>} `null` when the country was not found,
     *   otherwise `undefined`.
     */
    scrapeCountry(countryName) {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.initBrowser();
            try {
                const country = yield this.getCountryDetails(countryName);
                if (!country) {
                    const logger = this.config.logger;
                    if (logger != null)
                        logger.warn(`Country not found: ${countryName}`);
                    return null;
                }
                const data = {};
                yield this.queue.process(country, data);
                this.saveData(data, `${country.name}-postal-codes.json`, this.config.directory);
                const postalCodeLookup = this.generatePostalCodeLookup(data);
                this.saveData(postalCodeLookup, `${country.name}-lookup.json`, this.config.directory);
            }
            finally {
                // Without this, an early return or a rejection left the browser running.
                yield this.cleanup();
            }
        });
    }
    /**
     * Scrapes every country listed on the base page, one after another, saving
     * two JSON files per country. The browser is closed on every path.
     *
     * @returns {Promise<null|undefined>} `null` when no country was found,
     *   otherwise `undefined`.
     */
    scrapeCountries() {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.initBrowser();
            try {
                const logger = this.config.logger;
                const countries = yield this.getCountriesDetails();
                if (countries.length === 0) {
                    if (logger != null)
                        logger.warn("No countries found.");
                    return null;
                }
                for (const country of countries) {
                    const key = this.config.usePrettyName ? country.prettyName : country.name;
                    const countryData = {};
                    if (logger != null)
                        logger.info(`Processing country: ${key}`);
                    yield this.queue.process(country, countryData);
                    this.saveData(countryData, `${key}-postal-codes.json`, this.config.directory);
                    const postalCodeLookup = this.generatePostalCodeLookup(countryData);
                    this.saveData(postalCodeLookup, `${key}-lookup.json`, this.config.directory);
                }
            }
            finally {
                yield this.cleanup();
            }
        });
    }
    /** Launches the browser and builds the fetcher and queue that use it. */
    initBrowser() {
        return __awaiter(this, void 0, void 0, function* () {
            this.browser = yield puppeteer_1.default.launch({ headless: this.config.headless });
            this.fetcher = new fetchers_1.Fetcher(this.browser, this.config);
            this.queue = new queue_1.ProcessingQueue(this.fetcher, this.config);
        });
    }
    /**
     * Loads the base page and looks up a single country on it.
     *
     * @param {string} name Country name.
     * @returns {Promise<object|null>} The parsed country, or `null` when it is
     *   missing or the page could not be fetched (the error is logged).
     */
    getCountryDetails(name) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const html = yield this.fetcher.fetchWithRetry(this.config.baseUrl);
                return parsers_1.Parser.parseCountryByName((0, cheerio_1.load)(html), this.config, name);
            }
            catch (error) {
                const logger = this.config.logger;
                if (logger != null)
                    logger.error(`Error fetching country details: ${name}`, error);
                return null;
            }
        });
    }
    /**
     * Loads the base page and lists every country on it.
     *
     * @returns {Promise<object[]>} The parsed countries, or an empty array when
     *   the page could not be fetched (the error is logged).
     */
    getCountriesDetails() {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const html = yield this.fetcher.fetchWithRetry(this.config.baseUrl);
                return parsers_1.Parser.parseCountries((0, cheerio_1.load)(html), this.config);
            }
            catch (error) {
                const logger = this.config.logger;
                if (logger != null)
                    logger.error("Error fetching countries details", error);
                return [];
            }
        });
    }
    /**
     * Builds the lookup index for a scraped tree using a fresh id generator.
     *
     * @param {object} data Scraped tree (nested objects ending in code arrays).
     * @returns {{postalCodeMap: object, regions: object}}
     */
    generatePostalCodeLookup(data) {
        return this.buildLookup(data, (0, id_generator_1.createRegionIdGenerator)());
    }
    /**
     * Recursively walks the tree. For every code in a leaf array it records
     * `postalCodeMap[code] = id` and `regions[id] = path`, where `path` is the
     * list of keys leading to that array and `id` comes from `idGenerator(path)`.
     *
     * @param {*} regionObj Current node: array of codes, nested object, or anything else (ignored).
     * @param {Function} idGenerator Called with the key path; returns the region id.
     * @param {string[]} [acc] Keys from the root down to the current node.
     * @param {{postalCodeMap: object, regions: object}} [result] Accumulator, also returned.
     * @returns {{postalCodeMap: object, regions: object}}
     */
    buildLookup(regionObj, idGenerator, acc = [], result = { postalCodeMap: {}, regions: {} }) {
        if (Array.isArray(regionObj)) {
            for (const item of regionObj) {
                const id = idGenerator(acc);
                result.postalCodeMap[item] = id;
                result.regions[id] = [...acc];
            }
        }
        else if (typeof regionObj === "object" && regionObj !== null) {
            for (const [regionKey, regionValue] of Object.entries(regionObj)) {
                this.buildLookup(regionValue, idGenerator, [...acc, regionKey], result);
            }
        }
        return result;
    }
    /**
     * Turns a label into a file-name-safe slug: trimmed, lower-cased, combining
     * accents stripped, whitespace runs replaced by "-", and every character
     * other than a-z, 0-9, "." and "-" removed.
     *
     * @param {string} str Text to normalize.
     * @returns {string}
     */
    normalizeString(str) {
        return str
            .trim()
            .toLowerCase()
            .normalize("NFD")
            .replace(/[\u0300-\u036f]/g, "")
            .replace(/\s+/g, "-")
            .replace(/[^a-z0-9.-]/g, "");
    }
    /**
     * Writes `data` as pretty-printed JSON to `directory/<normalized fileName>`,
     * creating the directory when needed. Failures are logged, not thrown.
     *
     * @param {*} data Value to serialize.
     * @param {string} fileName Desired file name; passed through normalizeString.
     * @param {string} [directory="src/data"] Target directory.
     */
    saveData(data, fileName, directory = "src/data") {
        const logger = this.config.logger;
        try {
            (0, fs_1.mkdirSync)(directory, { recursive: true });
            const filePath = path_1.default.join(directory, this.normalizeString(fileName));
            (0, fs_1.writeFileSync)(filePath, JSON.stringify(data, null, 2), { flag: "w" });
            if (logger != null)
                logger.info(`Saved data to ${filePath}`);
        }
        catch (error) {
            if (logger != null)
                logger.error(`Error saving data to ${fileName}`, error);
        }
    }
    /** Closes the browser, if one is open, and forgets it so it is not closed twice. */
    cleanup() {
        return __awaiter(this, void 0, void 0, function* () {
            const browser = this.browser;
            this.browser = undefined;
            if (browser != null)
                yield browser.close();
        });
    }
}
|
|
146
|
+
exports.PostalCodeScraper = PostalCodeScraper;
|
|
147
|
+
exports.default = new PostalCodeScraper();
|
|
148
|
+
//# sourceMappingURL=scrapers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scrapers.js","sourceRoot":"","sources":["../../../../src/scraper/scrapers.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;AAAA,0DAA+C;AAC/C,mCAA0C;AAC1C,yCAAqC;AAErC,wDAAmF;AACnF,2BAA8C;AAC9C,gDAAwB;AACxB,qCAA+B;AAC/B,uCAAmC;AAEnC,MAAa,iBAAiB;IAK7B,YAAoB,SAAwB,EAAE;QAA1B,WAAM,GAAN,MAAM,CAAoB;QAC7C,IAAI,CAAC,MAAM,mBACV,OAAO,EAAE,6BAA6B,EACtC,WAAW,EAAE,EAAE,EACf,UAAU,EAAE,CAAC,EACb,QAAQ,EAAE,IAAI,EACd,SAAS,EAAE,UAAU,IAClB,MAAM,CACT,CAAC;IACH,CAAC;IAEK,aAAa,CAAC,WAAmB;;;YACtC,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;YAC1D,IAAI,CAAC,OAAO,EAAE,CAAC;gBACd,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,sBAAsB,WAAW,EAAE,CAAC,CAAC;gBAC9D,OAAO,IAAI,CAAC;YACb,CAAC;YAED,MAAM,IAAI,GAAwB,EAAE,CAAC;YACrC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;YAExC,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,oBAAoB,EAAE,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC;YAEjF,MAAM,gBAAgB,GAAG,IAAI,CAAC,wBAAwB,CAAC,IAAI,CAAC,CAAC;YAC7D,IAAI,CAAC,QAAQ,CAAC,gBAAgB,EAAE,GAAG,OAAO,CAAC,IAAI,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC;YAEvF,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;QACtB,CAAC;KAAA;IAEK,eAAe;;;YACpB,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;YACzB,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,mBAAmB,EAAE,CAAC;YACnD,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC5B,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAChD,OAAO,IAAI,CAAC;YACb,CAAC;YAED,KAAK,MAAM,OAAO,IAAI,SAAS,EAAE,CAAC;gBACjC,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;gBAC1E,MAAM,WAAW,GAAwB,EAAE,CAAC;gBAC5C,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;gBAEvD,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;gBAC/C,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,GAAG,GAAG,oBAAoB,EAAE,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC;gBAE/E,MAAM,gBAAgB,GAAG,IAAI,CAAC,wBAAwB,CAAC,WAAW,CAAC,CAAC;gBACpE,IAAI,CAAC,QAAQ,CAAC,gBAAgB,EAAE,GAAG,GAAG,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC;YAC/E,CAAC;YACD,MAAM,IAAI,
CAAC,OAAO,EAAE,CAAC;QACtB,CAAC;KAAA;IAEa,WAAW;;YACxB,IAAI,CAAC,OAAO,GAAG,MAAM,mBAAS,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;YAC1E,IAAI,CAAC,OAAO,GAAG,IAAI,kBAAO,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YACtD,IAAI,CAAC,KAAK,GAAG,IAAI,uBAAe,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QAC7D,CAAC;KAAA;IAEa,iBAAiB,CAAC,IAAY;;;YAC3C,IAAI,CAAC;gBACJ,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,OAAQ,CAAC,CAAC;gBACrE,OAAO,gBAAM,CAAC,kBAAkB,CAAC,IAAA,cAAI,EAAC,IAAI,CAAC,EAAE,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;YACjE,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,KAAK,CAAC,mCAAmC,IAAI,EAAE,EAAE,KAAK,CAAC,CAAC;gBAC5E,OAAO,IAAI,CAAC;YACb,CAAC;QACF,CAAC;KAAA;IAEa,mBAAmB;;;YAChC,IAAI,CAAC;gBACJ,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,OAAQ,CAAC,CAAC;gBACrE,OAAO,gBAAM,CAAC,cAAc,CAAC,IAAA,cAAI,EAAC,IAAI,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;YACvD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBAChB,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,KAAK,CAAC,kCAAkC,EAAE,KAAK,CAAC,CAAC;gBACrE,OAAO,EAAE,CAAC;YACX,CAAC;QACF,CAAC;KAAA;IAEO,wBAAwB,CAAC,IAAS;QACzC,OAAO,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,IAAA,sCAAuB,GAAE,CAAC,CAAC;IAC1D,CAAC;IAEO,WAAW,CAClB,SAAc,EACd,WAA8B,EAC9B,MAAgB,EAAE,EAClB,SAA2B,EAAE,aAAa,EAAE,EAAE,EAAE,OAAO,EAAE,EAAE,EAAE;QAE7D,IAAI,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YAC9B,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;gBAC9B,MAAM,EAAE,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;gBAC5B,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC;gBAChC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC;YAC/B,CAAC;QACF,CAAC;aAAM,IAAI,OAAO,SAAS,KAAK,QAAQ,IAAI,SAAS,KAAK,IAAI,EAAE,CAAC;YAChE,KAAK,MAAM,CAAC,SAAS,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;gBAClE,IAAI,CAAC,WAAW,CAAC,WAAW,EAAE,WAAW,EAAE,CAAC,GAAG,GAAG,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,CAAC;YACzE,CAAC;QACF,CAAC;QACD,OAAO,MAAM,CAAC;IACf,CAAC;IAEO,eAAe,CAAC,GAAW;QAClC,OAAO,GAAG;aACR,IAAI,EAAE;aACN,WAAW,EAAE;aACb,SAAS,CAAC,KAAK,CAAC;aAChB,OAA
O,CAAC,kBAAkB,EAAE,EAAE,CAAC;aAC/B,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;aACpB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;IAC/B,CAAC;IAEO,QAAQ,CAAC,IAAS,EAAE,QAAgB,EAAE,YAAoB,UAAU;;QAC3E,IAAI,CAAC;YACJ,IAAA,cAAS,EAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAC1C,MAAM,QAAQ,GAAG,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC,CAAC;YACtE,IAAA,kBAAa,EAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC;YACtE,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,IAAI,CAAC,iBAAiB,QAAQ,EAAE,CAAC,CAAC;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YAChB,MAAA,IAAI,CAAC,MAAM,CAAC,MAAM,0CAAE,KAAK,CAAC,wBAAwB,QAAQ,EAAE,EAAE,KAAK,CAAC,CAAC;QACtE,CAAC;IACF,CAAC;IAEK,OAAO;;;YACZ,MAAM,CAAA,MAAA,IAAI,CAAC,OAAO,0CAAE,KAAK,EAAE,CAAA,CAAC;QAC7B,CAAC;KAAA;CACD;AAnID,8CAmIC;AAED,kBAAe,IAAI,iBAAiB,EAAE,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/types.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createRegionIdGenerator = void 0;
|
|
4
|
+
/**
 * Creates a stateful generator of stable, human-readable region IDs.
 *
 * The returned function maps a region hierarchy (e.g. `["State", "City"]`) to
 * an ID of the form `{lastRegion}_{counter}`. Names are compared after
 * trimming, lowercasing, removing diacritics and collapsing whitespace runs
 * to `_`, so equivalent spellings of the same hierarchy share one ID. The
 * counter is kept per base name, which keeps IDs unique when different
 * hierarchies end in the same name.
 *
 * Each generator instance has its own registry; IDs are only stable within
 * one instance.
 *
 * @returns {function(string[]): string} Function that takes the region path
 *   (outermost first) and returns its ID. An empty path yields an ID with
 *   the base name `unknown`.
 */
const createRegionIdGenerator = () => {
    // Serialized normalized hierarchy -> ID already handed out for it.
    const regionRegistry = new Map();
    // Base name -> number of IDs issued so far with that base name.
    const counterMap = new Map();
    return (regions) => {
        // Normalize region names (case/diacritic/space insensitive).
        const normalized = regions.map((region) => region
            .trim()
            .toLowerCase()
            .normalize("NFD")
            .replace(/[\u0300-\u036f]/g, "") // Remove diacritics
            .replace(/\s+/g, "_"));
        // Serialize as JSON instead of joining with a separator: a joined key
        // would make ["a|b"] and ["a", "b"] indistinguishable and hand the
        // second hierarchy the first one's ID.
        const compositeKey = JSON.stringify(normalized);
        // Return the existing ID if this hierarchy was seen before.
        const existingId = regionRegistry.get(compositeKey);
        if (existingId !== undefined) {
            return existingId;
        }
        // Base name is the last region (typically the city). An empty
        // hierarchy has no last element, so use a fixed placeholder rather
        // than leaking "undefined" into the ID.
        const baseName = normalized.length > 0 ? normalized[normalized.length - 1] : "unknown";
        const count = (counterMap.get(baseName) || 0) + 1;
        counterMap.set(baseName, count);
        // ID format: {baseName}_{uniqueCounter}
        const newId = `${baseName}_${count}`;
        regionRegistry.set(compositeKey, newId);
        return newId;
    };
};
|
|
32
|
+
exports.createRegionIdGenerator = createRegionIdGenerator;
|
|
33
|
+
//# sourceMappingURL=id-generator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"id-generator.js","sourceRoot":"","sources":["../../../../src/utils/id-generator.ts"],"names":[],"mappings":";;;AAIO,MAAM,uBAAuB,GAAG,GAAsB,EAAE;IAC9D,MAAM,cAAc,GAAG,IAAI,GAAG,EAAkB,CAAC;IACjD,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAE7C,OAAO,CAAC,OAAiB,EAAU,EAAE;QACpC,4DAA4D;QAC5D,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE,CACzC,MAAM;aACJ,IAAI,EAAE;aACN,WAAW,EAAE;aACb,SAAS,CAAC,KAAK,CAAC;aAChB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC,oBAAoB;aACpD,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CACtB,CAAC;QAEF,yDAAyD;QACzD,MAAM,YAAY,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAE1C,2CAA2C;QAC3C,IAAI,cAAc,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE,CAAC;YACtC,OAAO,cAAc,CAAC,GAAG,CAAC,YAAY,CAAE,CAAC;QAC1C,CAAC;QAED,oEAAoE;QACpE,MAAM,QAAQ,GAAG,UAAU,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACnD,MAAM,KAAK,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QAClD,UAAU,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAEhC,gDAAgD;QAChD,MAAM,KAAK,GAAG,GAAG,QAAQ,IAAI,KAAK,EAAE,CAAC;QAErC,6BAA6B;QAC7B,cAAc,CAAC,GAAG,CAAC,YAAY,EAAE,KAAK,CAAC,CAAC;QAExC,OAAO,KAAK,CAAC;IACd,CAAC,CAAC;AACH,CAAC,CAAC;AApCW,QAAA,uBAAuB,2BAoClC"}
|