cry-bizisi-parser 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.txt +148 -0
- package/cli.ts +58 -0
- package/dist/bizisi.d.ts +29 -0
- package/dist/bizisi.d.ts.map +1 -0
- package/dist/bizisi.js +623 -0
- package/dist/bizisi.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/types.d.ts +66 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +35 -0
- package/src/bizisi.ts +720 -0
- package/src/index.ts +2 -0
- package/src/types.ts +71 -0
package/src/bizisi.ts
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bizi.si scraper — čisti Bun fetch, brez brskalnika.
|
|
3
|
+
*
|
|
4
|
+
* bizi.si uporablja ASP.NET WebForms, zato je treba:
|
|
5
|
+
* 1. Shranjevati piškotke (ASP.NET_SessionId, .ASPXAUTH)
|
|
6
|
+
* 2. Prenašati __VIEWSTATE in __EVENTVALIDATION med GET in POST
|
|
7
|
+
*
|
|
8
|
+
* Dependencies: tesseract.js (OCR of the name images); everything else uses the built-in fetch APIs.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { createWorker, type Worker } from "tesseract.js";
|
|
12
|
+
|
|
13
|
+
import type {
|
|
14
|
+
CompanySearchResult,
|
|
15
|
+
CompanyDetail,
|
|
16
|
+
FinancialData,
|
|
17
|
+
BankAccount,
|
|
18
|
+
KeyPeople,
|
|
19
|
+
ScrapeResult,
|
|
20
|
+
} from "./types.js";
|
|
21
|
+
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Config
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
// Origin (scheme + host) of the bizi.si site; the request URLs built in this file are prefixed with it.
const BASE_URL = "https://www.bizi.si";
|
|
27
|
+
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Cookie jar — ročno upravljanje prek Set-Cookie / Cookie
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
/**
 * Minimal in-memory cookie jar used to carry the bizi.si session between requests.
 *
 * Cookies are keyed by name only. The `Path` attribute is recorded but not
 * evaluated when building the request header; only the domain is checked.
 */
class CookieJar {
  /** Host the jar sends cookies to when the caller does not name one. */
  private static readonly DEFAULT_HOST = "www.bizi.si";

  private cookies = new Map<string, { value: string; domain: string; path: string }>();

  /** Collects the raw Set-Cookie header values across the runtimes this file supports. */
  private static readSetCookie(headers: Headers): (string | null | undefined)[] {
    const source = headers as Headers & {
      getSetCookie?: () => string[];
      getAll?: (name: string) => string | string[] | null;
    };
    // Newer runtimes expose getSetCookie(); older Bun builds exposed getAll().
    if (typeof source.getSetCookie === "function") return source.getSetCookie();
    if (typeof source.getAll === "function") {
      const raw = source.getAll("Set-Cookie");
      return Array.isArray(raw) ? raw : [raw];
    }
    // Last resort: walk the header entries and pick out set-cookie lines.
    const collected: string[] = [];
    for (const [key, value] of headers as unknown as Iterable<[string, string]>) {
      if (key.toLowerCase() === "set-cookie") collected.push(value);
    }
    return collected;
  }

  /**
   * Reads every Set-Cookie header of a response and stores, replaces or deletes
   * the corresponding cookie.
   *
   * A cookie is deleted when it carries `Max-Age <= 0` or an `Expires` date in
   * the past (`Max-Age` wins when both are present).
   */
  setFromHeaders(headers: Headers): void {
    for (const entry of CookieJar.readSetCookie(headers)) {
      if (!entry) continue;
      const [nv, ...rest] = entry.split(";");
      if (!nv) continue;
      const eq = nv.indexOf("=");
      if (eq < 0) continue;
      const name = nv.substring(0, eq).trim();
      const value = nv.substring(eq + 1).trim();

      let domain = ".bizi.si";
      let path = "/";
      let expired = false;
      let maxAgeSeen = false;

      for (const attr of rest) {
        const text = attr.trim();
        const sep = text.indexOf("=");
        const key = (sep < 0 ? text : text.slice(0, sep)).trim().toLowerCase();
        // Everything after the FIRST "=" is the attribute value (it may itself contain "=").
        const attrValue = sep < 0 ? "" : text.slice(sep + 1).trim();

        if (key === "domain") {
          if (attrValue) domain = attrValue;
        } else if (key === "path") {
          if (attrValue) path = attrValue;
        } else if (key === "max-age") {
          const seconds = Number.parseInt(attrValue, 10);
          if (!Number.isNaN(seconds)) {
            maxAgeSeen = true;
            expired = seconds <= 0;
          }
        } else if (key === "expires" && !maxAgeSeen) {
          const when = Date.parse(attrValue);
          if (!Number.isNaN(when)) expired = when <= Date.now();
        }
      }

      if (expired) {
        this.cookies.delete(name);
      } else {
        this.cookies.set(name, { value, domain, path });
      }
    }
  }

  /**
   * Builds the `Cookie` request header value.
   *
   * @param host Host the request goes to; defaults to `www.bizi.si`.
   * @returns `name=value` pairs joined by `"; "` for every cookie whose domain
   *          equals the host or is a parent domain of it; empty string if none.
   */
  getRequestHeader(host: string = CookieJar.DEFAULT_HOST): string {
    const target = host.toLowerCase();
    const parts: string[] = [];
    for (const [name, cookie] of this.cookies) {
      // A leading dot is only a legacy marker; compare the bare domain.
      const domain = cookie.domain.replace(/^\./, "").toLowerCase();
      if (domain && (target === domain || target.endsWith("." + domain))) {
        parts.push(`${name}=${cookie.value}`);
      }
    }
    return parts.join("; ");
  }

  /** Removes every stored cookie. */
  clear(): void {
    this.cookies.clear();
  }
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// HTTP helper — fetch s piškotki
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
 * `fetch` wrapper that sends the jar's cookies, stores cookies from every
 * response, and follows redirects manually so cookies set on intermediate
 * responses are not lost.
 *
 * Browser-like `User-Agent`, `Accept` and `Accept-Language` headers are added
 * unless the caller supplied them; a request with a body defaults to
 * `application/x-www-form-urlencoded`.
 *
 * @param url  Absolute URL to request.
 * @param jar  Cookie jar read before and updated after every hop.
 * @param init Options for the first request; every redirect is followed with a plain GET.
 * @returns The first response that is not a redirect carrying a `Location` header.
 * @throws Error when more than 10 redirects are chained (redirect loop).
 */
async function fetchWithCookies(
  url: string,
  jar: CookieJar,
  init: RequestInit = {},
): Promise<Response> {
  const maxRedirects = 10;
  let currentUrl = url;
  let currentInit: RequestInit = init;

  for (let hop = 0; hop <= maxRedirects; hop++) {
    const headers = new Headers(currentInit.headers);
    if (!headers.has("User-Agent")) {
      headers.set(
        "User-Agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
      );
    }
    if (!headers.has("Accept")) {
      headers.set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }
    if (!headers.has("Accept-Language")) {
      headers.set("Accept-Language", "sl-SI,sl;q=0.9,en;q=0.8");
    }

    // Re-read the jar on every hop: the previous response may have changed it.
    const cookie = jar.getRequestHeader();
    if (cookie) headers.set("Cookie", cookie);

    if (currentInit.body && !headers.has("Content-Type")) {
      headers.set("Content-Type", "application/x-www-form-urlencoded");
    }

    const res = await fetch(currentUrl, { ...currentInit, headers, redirect: "manual" });

    // Store new cookies before deciding whether to follow a redirect.
    jar.setFromHeaders(res.headers);

    const isRedirect = res.status >= 300 && res.status < 400;
    const location = isRedirect ? res.headers.get("location") : null;
    if (!location) return res;

    // Release the redirect body so the connection is not left dangling.
    try {
      await res.body?.cancel();
    } catch {
      // Best effort only; the body is irrelevant for a redirect.
    }

    // Resolve against the current URL: handles absolute, root-relative,
    // path-relative and protocol-relative Location values alike.
    currentUrl = new URL(location, currentUrl).toString();
    currentInit = { method: "GET" };
  }

  throw new Error(`Too many redirects (more than ${maxRedirects}) while fetching ${url}`);
}
|
|
138
|
+
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
// ASP.NET WebForms helperji
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
/**
 * Collects the ASP.NET WebForms state inputs (`__VIEWSTATE`, `__EVENTVALIDATION`,
 * `__VIEWSTATEGENERATOR`) from a page.
 *
 * @param html Raw page markup.
 * @returns Map from the input's `name` attribute to its `value` attribute
 *          (empty string when the tag has no `value`). Tags matched only by
 *          `id` that carry no `name` attribute are skipped.
 */
function extractAspFormFields(html: string): Record<string, string> {
  const inputPattern =
    /<input[^>]*(?:name|id)="(?:__VIEWSTATE|__EVENTVALIDATION|__VIEWSTATEGENERATOR)"[^>]*\/?>/gi;
  const collected: Record<string, string> = {};

  for (const [tag] of html.matchAll(inputPattern)) {
    const fieldName = /name="([^"]+)"/.exec(tag)?.[1];
    if (fieldName === undefined) continue;
    collected[fieldName] = /value="([^"]*)"/.exec(tag)?.[1] ?? "";
  }

  return collected;
}
|
|
157
|
+
|
|
158
|
+
/**
 * Serialises a flat record as a URL-encoded form body.
 *
 * @param data Field names mapped to their values.
 * @returns `key=value` pairs joined by `&`, both sides passed through
 *          `encodeURIComponent` (so a space becomes `%20`).
 */
function encodeForm(data: Record<string, string>): string {
  const pairs: string[] = [];
  for (const key of Object.keys(data)) {
    pairs.push(encodeURIComponent(key) + "=" + encodeURIComponent(data[key]!));
  }
  return pairs.join("&");
}
|
|
164
|
+
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
// Parser helperji
|
|
167
|
+
// ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
/** Collapses every run of whitespace to a single space and drops leading/trailing whitespace. */
function normalize(str: string): string {
  return str
    .split(/\s+/)
    .filter((piece) => piece !== "")
    .join(" ");
}
|
|
172
|
+
|
|
173
|
+
/**
 * Converts a Slovenian date such as "25. 3. 2015" to ISO form "2015-03-25".
 *
 * @param raw Text containing a `day. month. year` date.
 * @returns The ISO date of the first match, or `raw` unchanged when no date is found.
 *          The day and month values are not range-checked.
 */
function formatDate(raw: string): string {
  const parts = /(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})/.exec(raw);
  if (parts === null) return raw;

  const [, day, month, year] = parts;
  return [year!, month!.padStart(2, "0"), day!.padStart(2, "0")].join("-");
}
|
|
182
|
+
|
|
183
|
+
/**
 * Turns the free-text employee column into a number.
 *
 * @param raw Cell text, e.g. "12", "10 do 19", "ni podatka".
 * @returns 0 for empty text or text starting with "ni", "brez", "ne" or "/";
 *          the arithmetic mean of both bounds for a range (may be fractional);
 *          otherwise the first integer found, or 0 when there is none.
 */
function parseEmployees(raw: string): number {
  const text = raw.trim();
  if (text === "" || /^(ni|brez|ne|\/)/i.test(text)) return 0;

  const bounds = /(\d+)\s*(?:ali|do|-|–|,)\s*(\d+)/i.exec(text);
  if (bounds !== null) {
    const low = parseInt(bounds[1] ?? "0", 10);
    const high = parseInt(bounds[2] ?? "0", 10);
    return (low + high) / 2;
  }

  const lone = /(\d+)/.exec(text);
  return lone === null ? 0 : parseInt(lone[1] ?? "0", 10);
}
|
|
191
|
+
|
|
192
|
+
/**
 * Parses a number written with Slovenian separators ("." for thousands, "," for decimals).
 *
 * @param raw Text to parse, or `null`.
 * @returns The parsed value; `null` for `null`, empty text, a lone "-" or text
 *          that does not start with a number. Parsing uses `parseFloat`, so
 *          trailing non-numeric text after a leading number is ignored.
 */
function parseSlovenianNumber(raw: string | null): number | null {
  const text = raw ? raw.trim() : "";
  if (text === "" || text === "-") return null;
  if (text === "0") return 0;

  // Drop every thousands dot, then turn the first comma into a decimal point.
  const canonical = text.replace(/\./g, "").replace(/,/, ".");
  const parsed = parseFloat(canonical);
  return Number.isFinite(parsed) ? parsed : null;
}
|
|
201
|
+
|
|
202
|
+
/**
 * Returns the trimmed slice of `html` that lies between two markers.
 * The slice is returned as-is: HTML tags inside it are NOT removed.
 *
 * @param html        Text to search.
 * @param startMarker Marker whose first occurrence opens the slice (exclusive).
 * @param endMarker   Marker that closes the slice (exclusive), searched after the
 *                    start marker. When empty or not found, the slice runs to the end.
 * @returns The trimmed slice, or an empty string when `startMarker` is absent.
 */
function extractBetween(html: string, startMarker: string, endMarker: string): string {
  const markerPos = html.indexOf(startMarker);
  if (markerPos === -1) return "";

  const from = markerPos + startMarker.length;
  let to = html.length;
  if (endMarker) {
    const endPos = html.indexOf(endMarker, from);
    if (endPos >= 0) to = endPos;
  }
  return html.slice(from, to).trim();
}
|
|
211
|
+
|
|
212
|
+
/**
 * Replaces every HTML tag with a space and decodes the most common character
 * entities (`nbsp`, `amp`, `lt`, `gt`, `quot`, `apos` and their numeric forms
 * `#160`, `#38`, `#39`).
 *
 * Entities are decoded in a single pass, so already-escaped text such as
 * "&" + "amp;lt;" is decoded once and not a second time. Unknown entities are left untouched.
 *
 * @param html Markup fragment.
 * @returns Text with tags blanked out; whitespace is not collapsed.
 */
function stripTags(html: string): string {
  const decoded: Record<string, string> = {
    nbsp: " ",
    "#160": " ",
    amp: "&",
    "#38": "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    "#39": "'",
  };

  return html
    .replace(/<[^>]+>/g, " ")
    // Literal no-break spaces are treated like the entity form.
    .replace(/\u00a0/g, " ")
    .replace(
      /&(nbsp|#160|amp|#38|lt|gt|quot|apos|#39);/g,
      (whole: string, name: string) => decoded[name] ?? whole,
    );
}
|
|
216
|
+
|
|
217
|
+
/**
 * Extracts the cell texts of every `<table>` in the markup.
 *
 * Each entry of the result corresponds to ONE table and holds the normalized,
 * tag-free text of all its `<td>`/`<th>` cells, row after row, in a single flat
 * list. Rows without cells and tables without any non-empty row are skipped.
 *
 * NOTE(review): `tableIdx` is accepted but never read, and row boundaries are
 * lost by the flattening — confirm whether callers expect the rows of one
 * selected table instead.
 *
 * @param html     Markup to scan.
 * @param tableIdx Currently unused.
 * @returns One flat list of cell texts per table.
 */
function parseTable(html: string, tableIdx = 0): string[][] {
  const cellsPerTable: string[][] = [];

  for (const tableMatch of html.matchAll(/<table[^>]*>([\s\S]*?)<\/table>/gi)) {
    const flatCells: string[] = [];

    for (const rowMatch of tableMatch[1]!.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
      const rowCells = Array.from(
        rowMatch[1]!.matchAll(/<t[dh][^>]*>([\s\S]*?)<\/t[dh]>/gi),
        (cell) => normalize(stripTags(cell[1]!)),
      );
      flatCells.push(...rowCells);
    }

    if (flatCells.length > 0) cellsPerTable.push(flatCells);
  }

  return cellsPerTable;
}
|
|
239
|
+
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
// FetchBizisiScraper
|
|
242
|
+
// ---------------------------------------------------------------------------
|
|
243
|
+
|
|
244
|
+
/**
 * Scraper for bizi.si built on plain `fetch` (no browser).
 *
 * One instance owns one cookie session. Names of key people are served as
 * images, so they are recognised with a lazily created tesseract.js worker;
 * call {@link BizisiScraper.close} when done to release it.
 */
export class BizisiScraper {
  private jar = new CookieJar();
  private _loggedIn = false;
  /** Pending or ready OCR worker; the promise is cached so concurrent callers share one worker. */
  private ocrWorker: Promise<Worker> | null = null;

  /** Returns the shared OCR worker, creating it on first use. */
  #getOcrWorker(): Promise<Worker> {
    if (!this.ocrWorker) {
      const pending = createWorker("slv", 1, {
        logger: () => {}, // silence progress logging
      });
      // Forget a failed initialisation so a later call can retry.
      pending.catch(() => {
        if (this.ocrWorker === pending) this.ocrWorker = null;
      });
      this.ocrWorker = pending;
    }
    return this.ocrWorker;
  }

  /** Terminates the OCR worker, if one was created. Errors during shutdown are ignored. */
  async close(): Promise<void> {
    const pending = this.ocrWorker;
    if (!pending) return;
    this.ocrWorker = null;
    try {
      const worker = await pending;
      await worker.terminate();
    } catch {
      // Best effort: nothing useful can be done if shutdown fails.
    }
  }

  /**
   * Downloads one image and returns the text recognised in it.
   *
   * @param url Image URL, absolute or relative to the site origin.
   * @returns The trimmed OCR text; on any failure (network, HTTP error, OCR)
   *          the original `url` is returned as a fallback.
   */
  async #ocrImage(url: string): Promise<string> {
    try {
      const absolute = new URL(url, BASE_URL).toString();
      const resp = await fetch(absolute);
      // Do not feed an HTML error page to the OCR engine.
      if (!resp.ok) throw new Error(`Image request failed with HTTP ${resp.status}`);
      const buf = Buffer.from(await resp.arrayBuffer());
      const worker = await this.#getOcrWorker();
      const { data } = await worker.recognize(buf);
      return data.text.trim();
    } catch {
      return url; // fall back to the URL
    }
  }

  /** Whether {@link BizisiScraper.login} has completed successfully on this instance. */
  get loggedIn(): boolean {
    return this._loggedIn;
  }

  // -----------------------------------------------------------------------
  // Login
  // -----------------------------------------------------------------------

  /**
   * Logs in through the WebForms login page.
   *
   * @param username Account name; defaults to `process.env.BIZISI_USERNAME`.
   * @param password Password; defaults to `process.env.BIZISI_PASSWORD`.
   * @throws Error when credentials are missing, the login page has no
   *         `__VIEWSTATE`, or the response still looks like the login form.
   */
  async login(username?: string, password?: string): Promise<void> {
    const user = username ?? process.env.BIZISI_USERNAME;
    const pass = password ?? process.env.BIZISI_PASSWORD;
    if (!user || !pass) throw new Error("Missing BIZISI_USERNAME or BIZISI_PASSWORD");

    // 1. GET the login page to obtain __VIEWSTATE and friends.
    const loginPage = await fetchWithCookies(`${BASE_URL}/prijava/`, this.jar);
    const html = await loginPage.text();

    const aspFields = extractAspFormFields(html);
    if (!aspFields["__VIEWSTATE"]) {
      throw new Error("Could not find __VIEWSTATE on login page");
    }

    // 2. POST the login form.
    const formData: Record<string, string> = {
      ...aspFields,
      "ctl00$cphMain$loginBox1$UserName": user,
      "ctl00$cphMain$loginBox1$Password": pass,
      "ctl00$cphMain$loginBox1$ButtonLogin": "Prijavi se",
      "ctl00$cphMain$loginBox1$cvUN": "",
      "ctl00$cphMain$loginBox1$cvPW": "",
      __LASTFOCUS: "",
      __SCROLLPOSITION: "",
    };

    const loginRes = await fetchWithCookies(`${BASE_URL}/prijava/?rw=1`, this.jar, {
      method: "POST",
      body: encodeForm(formData),
      headers: {
        Referer: `${BASE_URL}/prijava/`,
        Origin: BASE_URL,
      },
    });

    const loginHtml = await loginRes.text();

    // The login button without the "Moj Bizi" marker means we are still on the login form.
    if (loginHtml.includes("Prijavi se") && !loginHtml.includes("Moj Bizi")) {
      if (/napačno|neveljaven|Presegli/i.test(loginHtml)) {
        throw new Error("Login failed: invalid credentials or rate-limited");
      }
      throw new Error("Login failed: could not verify");
    }

    this._loggedIn = true;
  }

  // -----------------------------------------------------------------------
  // Search
  // -----------------------------------------------------------------------

  /**
   * Runs a company search.
   *
   * @param query Free-text query.
   * @returns The result rows parsed from the first result page.
   */
  async search(query: string): Promise<CompanySearchResult[]> {
    const res = await fetchWithCookies(
      `${BASE_URL}/iskanje?q=${encodeURIComponent(query)}`,
      this.jar,
    );
    const html = await res.text();

    return this.#parseSearchResults(html);
  }

  /** Parses the result rows (`div.row.b-table-row` of the results repeater) out of a search page. */
  #parseSearchResults(html: string): CompanySearchResult[] {
    const results: CompanySearchResult[] = [];

    const rowRegex =
      /<div[^>]*id="ctl00_cphMain_DisplayRecords1_RepeaterResults_[^"]*"[^>]*class="[^"]*row b-table-row[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi;

    for (const rowMatch of html.matchAll(rowRegex)) {
      const rowHtml = rowMatch[1]!;

      // Company link and title; rows without one are skipped.
      const linkMatch = rowHtml.match(
        /<a[^>]*class="[^"]*b-link-company[^"]*"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<span[^>]*class="b-company-title"[^>]*>([\s\S]*?)<\/span>/i,
      );
      if (!linkMatch) continue;

      const href = linkMatch[1]!;
      const name = normalize(stripTags(linkMatch[2]!));

      // Cells: div.col.b-table-cell
      const cells = Array.from(
        rowHtml.matchAll(/<div[^>]*class="[^"]*col b-table-cell[^"]*"[^>]*>([\s\S]*?)<\/div>/gi),
      );

      const getCell = (idx: number): string => {
        const cell = cells[idx];
        return cell ? normalize(stripTags(cell[1]!)) : "";
      };

      // Cells: 0=checkbox, 1=name, 2=address, 3=city, 4=reg#, 5=tax#, 6=activity, 7=employees
      // Absolute links are reduced to their path so `url` is always site-relative.
      const url = href.startsWith("http") ? "/" + href.split("/").slice(3).join("/") : href;

      results.push({
        name,
        url: url.replace(/\/+$/, "") + "/",
        address: getCell(2),
        city: getCell(3),
        registrationNumber: getCell(4),
        taxNumber: getCell(5),
        activity: getCell(6),
        employees: parseEmployees(getCell(7)),
      });
    }

    return results;
  }

  // -----------------------------------------------------------------------
  // Company detail
  // -----------------------------------------------------------------------

  /**
   * Fetches and parses a company profile page.
   *
   * @param relativeUrl Site-relative path of the company page (leading slash optional).
   * @returns The parsed detail; fields that cannot be located are empty strings,
   *          `false`, or empty collections.
   */
  async getCompanyDetail(relativeUrl: string): Promise<CompanyDetail> {
    const url = `${BASE_URL}${relativeUrl.startsWith("/") ? "" : "/"}${relativeUrl}`;
    const res = await fetchWithCookies(url, this.jar);
    const html = await res.text();

    // Drop scripts and styles and decode the two entities that matter for matching
    // (no-break space and ampersand, e.g. inside image URLs).
    const cleanHtml = html
      .replace(/<script[\s\S]*?<\/script>/gi, "")
      .replace(/<style[\s\S]*?<\/style>/gi, "")
      .replace(/&(?:nbsp|#160);/g, " ")
      .replace(/&(?:amp|#38);/g, "&");

    const fullText = normalize(stripTags(cleanHtml));

    // The main content follows the <h1>; everything before it is sidebar markup.
    const h1Start = cleanHtml.indexOf("<h1");
    const afterH1Html = h1Start >= 0 ? cleanHtml.substring(h1Start) : cleanHtml;

    // Marker-based extraction restricted to the markup after the <h1>.
    const extractAfterH1 = (start: string, end: string): string =>
      extractBetween(afterH1Html, start, end);

    // --- Header info ---
    const h1Match = cleanHtml.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
    const h1Name = h1Match ? normalize(stripTags(h1Match[1]!)) : "";
    const titleMatch = cleanHtml.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const title = titleMatch ? titleMatch[1]!.replace(/ - .*$/, "").trim() : "";
    const name = h1Name || title;

    // Phone: text of the first tel: link, falling back to the href when the link has no text.
    const phoneMatch = cleanHtml.match(/<a[^>]*href="tel:([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
    const phone = phoneMatch
      ? normalize(stripTags(phoneMatch[2]!)) || normalize(phoneMatch[1]!)
      : "";

    // Email: same approach with the first mailto: link.
    const emailMatch = cleanHtml.match(/<a[^>]*href="mailto:([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
    const email = emailMatch
      ? normalize(stripTags(emailMatch[2]!)) || normalize(emailMatch[1]!)
      : "";

    // Website: first link in the header part whose text is an external URL.
    let website = "";
    const headerHtml = extractAfterH1("</h1>", "Matično podjetje");
    for (const link of headerHtml.matchAll(/<a[^>]*>([\s\S]*?)<\/a>/gi)) {
      const text = normalize(stripTags(link[1]!));
      if (text.startsWith("http") && !text.includes("bizi.si")) {
        website = text;
        break;
      }
    }

    // Address: the line containing a house number and a 4-digit postal code.
    let address = "";
    const addrRaw = extractAfterH1("</h1>", '<h5 class="b-box-title');
    if (addrRaw) {
      const addrText = normalize(stripTags(addrRaw));
      // Keep the text up to the first e-mail, phone number, "Več", "Dodaj" or URL.
      const upToMarker =
        addrText
          .split(/\s+\S+@\S+|\s+\d{3}\s\d{3,6}|\s+Več\s+|\s+Dodaj\s+|\s+http/i)[0]
          ?.trim() ?? "";
      const m = upToMarker.match(
        /([A-ZČŠŽa-zčšž][^,]+\d+[^,]*,\s*[^,]+,\s*\d{4}\s+[A-ZČŠŽa-zčšž][^,]+)/,
      );
      if (m) address = m[1]!.trim();
    }

    // --- "Matično podjetje" section (after the h1 only) ---
    const maticnoHtml = extractAfterH1("Matično podjetje", "Ključne osebe");
    const maticnoText = normalize(stripTags(maticnoHtml));

    const davcnaM = maticnoText.match(/Davčna številka.*?(\d{7,9})/i);
    const maticnaM = maticnoText.match(/Matična.*?(\d{7,10})/i);
    const ddvM = maticnoText.match(/Zavezanec za DDV.*?(Da|Ne)/i);
    const skisM = maticnoText.match(/SKIS\s*:?\s*([^\n]+?)(?:\s+(?:Datum|Vir))/i);
    const vpisM = maticnoText.match(/Datum vpisa\s*:?\s*([^\n]+?)(?:\s+(?:Dejavnost|Vir))/i);
    const dejavnostM = maticnoText.match(/Dejavnost TSmedia\s*:?\s*([^\n]+?)(?:\s+Vir)/i);

    const taxNumber = davcnaM ? davcnaM[1]! : "";
    const registrationNumber = maticnaM ? maticnaM[1]! : "";
    const vatPayer = ddvM ? ddvM[1]!.toLowerCase() === "da" : false;
    // Require the digit run to end after 7-9 digits so an unspaced IBAN ("SI56...") is not
    // mistaken for a VAT number.
    const vatNumM = fullText.match(/\bSI(\d{7,9})(?!\d)/);
    const vatNumber = vatNumM ? "SI" + vatNumM[1]! : "";
    const skis = skisM ? skisM[1]!.trim() : "";
    const dateOfEntry = vpisM ? formatDate(vpisM[1]!.trim()) : "";
    const activity = dejavnostM ? dejavnostM[1]!.trim() : "";

    // --- Bank accounts (after the h1 only) ---
    const trrHtml = extractAfterH1("TRR in blokade", "Bizi obveščevalec");
    const trrText = normalize(stripTags(trrHtml));
    const bankAccounts: BankAccount[] = [];
    for (const ibanM of trrText.matchAll(/(SI\d{2}\s*\d{4}\s*\d{4}\s*\d{4}\s*\d{3})/g)) {
      bankAccounts.push({
        iban: ibanM[1]!.replace(/\s+/g, " ").trim(),
        // NOTE(review): status is hard-coded; the page text is not inspected for it.
        status: "odprt",
        openedDate: null,
        bank: null,
      });
    }

    // --- Key people (after the h1, before "Bonitetna ocena") ---
    const osebeHtml = extractAfterH1("Ključne osebe", "Bonitetna ocena");
    const keyPeople = await this.#ocrKeyPeople(this.#parseKeyPeople(osebeHtml));

    // --- Budget financing (after the h1 only) ---
    const finHtml = extractAfterH1("Financiranje iz proračuna", "Ne spreglejte");
    const finText = normalize(stripTags(finHtml));
    const financingFromBudget = this.#parseFinancing(finText);

    return {
      name,
      fullAddress: address,
      phone,
      website,
      email,
      taxNumber,
      registrationNumber,
      vatPayer,
      vatNumber,
      skis,
      dateOfEntry,
      activity,
      keyPeople,
      bankAccounts,
      financingFromBudget,
    };
  }

  // -----------------------------------------------------------------------
  // Financial data
  // -----------------------------------------------------------------------

  /**
   * Fetches the "financni-podatki" sub-page of a company and extracts yearly figures.
   *
   * @param relativeUrl Site-relative path of the company page (leading slash optional).
   * @returns One entry per table that mentions a year from 2020 on; an empty list
   *          when the page looks like a paywall/registration prompt.
   */
  async getFinancialData(relativeUrl: string): Promise<FinancialData[]> {
    const trimmed = relativeUrl.replace(/\/+$/, "");
    // Accept paths without a leading slash, like getCompanyDetail does.
    const basePath = trimmed === "" || trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
    const finUrl = `${BASE_URL}${basePath}/financni-podatki/`;

    const res = await fetchWithCookies(finUrl, this.jar);
    const html = await res.text();

    // Paywall check
    if (/registriraj.*se.*brezplačno|naroči.*polni.*dostop|preizkusi/i.test(html)) {
      return [];
    }

    const cleanHtml = html
      .replace(/<script[\s\S]*?<\/script>/gi, "")
      .replace(/<style[\s\S]*?<\/style>/gi, "");

    // NOTE(review): parseTable yields one FLAT cell list per table, so each `cells`
    // below is a whole table rather than a single row, and the year cell itself can
    // be picked up as an amount — confirm against the real page layout.
    const tables = parseTable(cleanHtml);
    const data: FinancialData[] = [];

    for (const cells of tables) {
      const joined = cells.join(" ");
      const yearMatch = joined.match(/\b(20[2-9]\d)\b/);
      if (!yearMatch) continue;

      const year = parseInt(yearMatch[1]!, 10);
      // First positive amount = net sales revenue, second = operating result.
      const amounts = cells
        .map((c) => parseSlovenianNumber(c))
        .filter((n): n is number => n !== null && n > 0);

      data.push({
        year,
        netSalesRevenue: amounts[0] ?? null,
        operatingResult: amounts[1] ?? null,
      });
    }

    return data;
  }

  // -----------------------------------------------------------------------
  // Scrape
  // -----------------------------------------------------------------------

  /** Fetches both the detail page and the financial page of one company. */
  async scrapeCompany(relativeUrl: string): Promise<ScrapeResult> {
    const company = await this.getCompanyDetail(relativeUrl);
    const financialData = await this.getFinancialData(relativeUrl);
    return { company, financialData };
  }

  /**
   * Searches for companies and scrapes every hit, one after another.
   *
   * @param query   Free-text search query.
   * @param options `maxResults`: cap on scraped hits (0 or omitted = all);
   *                `includeFinancialData`: when `false`, the financial page is
   *                not requested and `financialData` is empty (default `true`).
   * @returns One result per hit. A hit whose scrape throws is logged and
   *          represented by a stub built from the search row.
   */
  async scrape(
    query: string,
    options?: { maxResults?: number; includeFinancialData?: boolean },
  ): Promise<ScrapeResult[]> {
    const maxResults = options?.maxResults ?? 0;
    const includeFinancial = options?.includeFinancialData ?? true;

    const results = await this.search(query);
    const limited = maxResults > 0 ? results.slice(0, maxResults) : results;

    const output: ScrapeResult[] = [];
    for (const result of limited) {
      try {
        const scraped: ScrapeResult = includeFinancial
          ? await this.scrapeCompany(result.url)
          : { company: await this.getCompanyDetail(result.url), financialData: [] };
        output.push(scraped);
      } catch (err) {
        console.error(`Failed to scrape ${result.name}:`, err);
        output.push({
          company: {
            name: result.name,
            fullAddress: "",
            phone: "",
            website: "",
            email: "",
            taxNumber: result.taxNumber,
            registrationNumber: result.registrationNumber,
            vatPayer: false,
            vatNumber: "",
            skis: "",
            dateOfEntry: "",
            activity: result.activity,
            keyPeople: { nadzorniki: [], ustanovitelji: [], zastopniki: [] },
            bankAccounts: [],
            financingFromBudget: {},
          },
          financialData: [],
        });
      }
    }

    return output;
  }

  // -----------------------------------------------------------------------
  // Private helpers
  // -----------------------------------------------------------------------

  /** Status messages that may appear in place of a person's name. */
  static #statusPhrases = [
    'ni vpisan', 'ni vpisanih', 'ni aktivnih',
    'trenutno ni', 'ni podatka', 'ni zabeleženih',
    'ni najdenih', 'brez podatka',
  ];

  /** True when the text contains one of the known status phrases (case-insensitive). */
  static #isStatus(text: string): boolean {
    const lower = text.toLowerCase();
    return BizisiScraper.#statusPhrases.some((p) => lower.includes(p));
  }

  /**
   * Splits the "Ključne osebe" markup into its three categories and collects,
   * per person, the ImageGenerator URLs that render the name.
   *
   * Each person is one `<span class="...b-attr-value...">`; the URLs of one
   * person are joined with `|||` so they can be OCR-ed and re-joined later.
   *
   * @param html Markup of the key-people section.
   * @returns Per category, one `|||`-joined URL string per person.
   */
  #parseKeyPeople(html: string): KeyPeople {
    const result: KeyPeople = { nadzorniki: [], ustanovitelji: [], zastopniki: [] };
    if (!html) return result;

    const cats = [
      { label: 'Nadzorniki:', key: 'nadzorniki' as const },
      { label: 'Ustanovitelji:', key: 'ustanovitelji' as const },
      { label: 'Zastopniki:', key: 'zastopniki' as const },
    ];

    for (const { label, key } of cats) {
      const labelIdx = html.indexOf(label);
      if (labelIdx < 0) continue;

      // A section ends at the nearest following label of ANY other category, so a
      // missing or reordered category cannot make one section swallow the next.
      const begin = labelIdx + label.length;
      let end = html.length;
      for (const other of cats) {
        if (other.label === label) continue;
        const otherIdx = html.indexOf(other.label, begin);
        if (otherIdx >= 0 && otherIdx < end) end = otherIdx;
      }
      const sectionHtml = html.substring(begin, end);

      const personSpans = sectionHtml.matchAll(
        /<span[^>]*class="[^"]*b-attr-value[^"]*"[^>]*>([\s\S]*?)<\/span>/gi,
      );

      for (const span of personSpans) {
        const urls = Array.from(
          span[1]!.matchAll(/<img[^>]*src="([^"]*ImageGenerator[^"]*)"[^>]*>/gi),
          (m) => m[1]!,
        );
        if (urls.length > 0) result[key].push(urls.join('|||')); // delimiter consumed by the OCR step
      }
    }

    return result;
  }

  /**
   * Replaces the `|||`-joined image URLs produced by `#parseKeyPeople` with the
   * recognised names.
   *
   * Entries whose text is empty or a status phrase are dropped, and role words
   * (direktor, prokurist, ...) are removed from the remaining text.
   */
  async #ocrKeyPeople(raw: KeyPeople): Promise<KeyPeople> {
    const result: KeyPeople = { nadzorniki: [], ustanovitelji: [], zastopniki: [] };

    for (const group of ['nadzorniki', 'ustanovitelji', 'zastopniki'] as const) {
      for (const entry of raw[group]) {
        const urls = entry.split('|||');

        const texts = await Promise.all(urls.map((url: string) => this.#ocrImage(url)));
        const joined = texts.join(' ').trim();

        if (!joined || BizisiScraper.#isStatus(joined)) continue;

        // Remove role words (direktor, prokurist, ...)
        const cleaned = joined
          .replace(/\s*,?\s*(?:direktor|prokurist|član|predsednik|namestnik|vodja)\s*,?\s*/gi, '')
          .trim();
        if (cleaned) result[group].push(cleaned);
      }
    }

    return result;
  }

  /**
   * Extracts `year -> amount` pairs from the budget-financing text.
   *
   * @param text Tag-free, whitespace-normalized section text.
   * @returns Amounts keyed by year; only years 2010-2030 with a parsable amount
   *          are kept, and a later match for the same year overwrites an earlier one.
   */
  #parseFinancing(text: string): Record<number, number> {
    const result: Record<number, number> = {};
    const regex = /\b(20[0-9]{2})\D{1,20}?(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)\s*(?:EUR|€)?/g;
    for (const m of text.matchAll(regex)) {
      const year = parseInt(m[1]!, 10);
      const amount = parseSlovenianNumber(m[2] ?? null);
      if (year >= 2010 && year <= 2030 && amount !== null) {
        result[year] = amount;
      }
    }
    return result;
  }
}
|