cry-bizisi-parser 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/bizisi.ts ADDED
@@ -0,0 +1,720 @@
1
+ /**
2
+ * bizi.si scraper — čisti Bun fetch, brez brskalnika.
3
+ *
4
+ * bizi.si uporablja ASP.NET WebForms, zato je treba:
5
+ * 1. Shranjevati piškotke (ASP.NET_SessionId, .ASPXAUTH)
6
+ * 2. Prenašati __VIEWSTATE in __EVENTVALIDATION med GET in POST
7
+ *
8
+ * Odvisnosti: 0 — samo Bun built-in API-ji.
9
+ */
10
+
11
+ import { createWorker, type Worker } from "tesseract.js";
12
+
13
+ import type {
14
+ CompanySearchResult,
15
+ CompanyDetail,
16
+ FinancialData,
17
+ BankAccount,
18
+ KeyPeople,
19
+ ScrapeResult,
20
+ } from "./types.js";
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Config
24
+ // ---------------------------------------------------------------------------
25
+
26
+ const BASE_URL = "https://www.bizi.si";
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // Cookie jar — ročno upravljanje prek Set-Cookie / Cookie
30
+ // ---------------------------------------------------------------------------
31
+
32
/**
 * Minimal in-memory cookie jar for a single site.
 *
 * Cookies are keyed by name only. `Domain` and `Path` are recorded, but only
 * the domain is consulted when building the request header.
 */
class CookieJar {
  private cookies = new Map<string, { value: string; domain: string; path: string }>();

  /**
   * Collects the raw `Set-Cookie` header values from a response.
   *
   * Tries `getSetCookie()` first, then `getAll("Set-Cookie")`, and finally
   * falls back to iterating all headers.
   */
  private static readSetCookie(headers: Headers): string[] {
    const h = headers as Headers & {
      getSetCookie?: () => string[];
      getAll?: (name: string) => string | string[];
    };
    if (typeof h.getSetCookie === "function") {
      return h.getSetCookie();
    }
    if (typeof h.getAll === "function") {
      const raw = h.getAll("Set-Cookie");
      return Array.isArray(raw) ? raw : [raw];
    }
    const values: string[] = [];
    headers.forEach((value, key) => {
      if (key.toLowerCase() === "set-cookie") values.push(value);
    });
    return values;
  }

  /**
   * Reads all `Set-Cookie` headers and stores (or removes) the cookies.
   *
   * A cookie whose `Max-Age` is <= 0, or whose `Expires` date lies in the
   * past, is deleted from the jar instead of being stored with a stale value.
   * `Max-Age` takes precedence over `Expires` when both are present.
   */
  setFromHeaders(headers: Headers): void {
    for (const entry of CookieJar.readSetCookie(headers)) {
      if (!entry) continue;
      const [nv, ...rest] = entry.split(";");
      if (!nv) continue;
      const eq = nv.indexOf("=");
      if (eq < 0) continue;
      const name = nv.substring(0, eq).trim();
      const value = nv.substring(eq + 1).trim();

      let domain = ".bizi.si";
      let path = "/";
      let maxAge: number | null = null;
      let expiresAt: number | null = null;

      for (const attr of rest) {
        // Split on the FIRST "=" only, so attribute values that contain "=" survive intact.
        const sep = attr.indexOf("=");
        const attrName = (sep < 0 ? attr : attr.substring(0, sep)).trim().toLowerCase();
        const attrValue = sep < 0 ? "" : attr.substring(sep + 1).trim();

        if (attrName === "domain") {
          // An empty Domain attribute is ignored; the default stays in place.
          if (attrValue) domain = attrValue;
        } else if (attrName === "path") {
          if (attrValue) path = attrValue;
        } else if (attrName === "max-age") {
          const seconds = Number.parseInt(attrValue, 10);
          if (Number.isFinite(seconds)) maxAge = seconds;
        } else if (attrName === "expires") {
          const when = Date.parse(attrValue);
          if (Number.isFinite(when)) expiresAt = when;
        }
      }

      const expired =
        maxAge !== null ? maxAge <= 0 : expiresAt !== null && expiresAt <= Date.now();
      if (expired) {
        this.cookies.delete(name);
        continue;
      }
      this.cookies.set(name, { value, domain, path });
    }
  }

  /**
   * Builds the `Cookie` request header value.
   *
   * @param host Host name the request is sent to. A cookie is included when
   *   its domain (leading dot ignored) equals the host or is a parent domain
   *   of it.
   * @returns `name=value` pairs joined by `"; "`, or an empty string.
   */
  getRequestHeader(host = "www.bizi.si"): string {
    const target = host.toLowerCase();
    const parts: string[] = [];
    for (const [name, c] of this.cookies) {
      const cookieDomain = c.domain.replace(/^\./, "").toLowerCase();
      if (!cookieDomain) continue;
      if (target === cookieDomain || target.endsWith(`.${cookieDomain}`)) {
        parts.push(`${name}=${c.value}`);
      }
    }
    return parts.join("; ");
  }

  /** Removes every stored cookie. */
  clear(): void {
    this.cookies.clear();
  }
}
90
+
91
+ // ---------------------------------------------------------------------------
92
+ // HTTP helper — fetch s piškotki
93
+ // ---------------------------------------------------------------------------
94
+
95
/**
 * `fetch` wrapper that sends the jar's cookies, stores any cookies the server
 * sets, and follows redirects manually so cookies set on intermediate
 * responses are kept.
 *
 * Browser-like `User-Agent`, `Accept` and `Accept-Language` headers are added
 * unless the caller supplied them. A request with a body defaults to
 * `application/x-www-form-urlencoded`.
 *
 * Every redirect is followed with a plain `GET` (no body, default headers),
 * regardless of the redirect status code.
 *
 * @param url Absolute URL to request.
 * @param jar Cookie jar that is read before and updated after each hop.
 * @param init Optional fetch options for the first request.
 * @param maxRedirects Maximum number of redirects to follow before giving up.
 * @returns The first non-redirect response (or a 3xx response without `Location`).
 * @throws Error when more than `maxRedirects` redirects are encountered.
 */
async function fetchWithCookies(
  url: string,
  jar: CookieJar,
  init: RequestInit = {},
  maxRedirects = 10,
): Promise<Response> {
  const headers = new Headers(init.headers);
  // Default headers
  if (!headers.has("User-Agent")) {
    headers.set(
      "User-Agent",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    );
  }
  if (!headers.has("Accept")) {
    headers.set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
  }
  if (!headers.has("Accept-Language")) {
    headers.set("Accept-Language", "sl-SI,sl;q=0.9,en;q=0.8");
  }

  const cookie = jar.getRequestHeader();
  if (cookie) headers.set("Cookie", cookie);

  if (init.body && !headers.has("Content-Type")) {
    headers.set("Content-Type", "application/x-www-form-urlencoded");
  }

  const res = await fetch(url, { ...init, headers, redirect: "manual" });

  // Store cookies from every hop, including redirect responses.
  jar.setFromHeaders(res.headers);

  if (res.status >= 300 && res.status < 400) {
    const location = res.headers.get("location");
    if (location) {
      if (maxRedirects <= 0) {
        throw new Error(`Too many redirects while fetching ${url}`);
      }
      // Release the unread redirect body so the connection can be reused.
      try {
        await res.body?.cancel();
      } catch {
        // Best effort only; an un-cancellable body must not fail the request.
      }
      // Resolve against the current URL so absolute, root-relative and
      // path-relative Location values are all handled.
      const redirectUrl = new URL(location, url).href;
      return fetchWithCookies(redirectUrl, jar, { method: "GET" }, maxRedirects - 1);
    }
  }

  return res;
}
138
+
139
+ // ---------------------------------------------------------------------------
140
+ // ASP.NET WebForms helperji
141
+ // ---------------------------------------------------------------------------
142
+
143
/**
 * Extracts the hidden ASP.NET WebForms state inputs from a page.
 *
 * Looks for `<input>` tags whose `name` or `id` is `__VIEWSTATE`,
 * `__EVENTVALIDATION` or `__VIEWSTATEGENERATOR` (double-quoted attributes
 * only) and returns a map from each tag's `name` to its `value`. A missing
 * `value` attribute yields an empty string; tags without `name` are skipped.
 */
function extractAspFormFields(html: string): Record<string, string> {
  const inputPattern =
    /<input[^>]*(?:name|id)="(?:__VIEWSTATE|__EVENTVALIDATION|__VIEWSTATEGENERATOR)"[^>]*\/?>/gi;
  const collected: Record<string, string> = {};

  for (const hit of html.matchAll(inputPattern)) {
    const tag = hit[0];
    const nameAttr = /name="([^"]+)"/.exec(tag);
    if (nameAttr === null) continue;
    const valueAttr = /value="([^"]*)"/.exec(tag);
    collected[nameAttr[1]!] = valueAttr === null ? "" : valueAttr[1]!;
  }

  return collected;
}
157
+
158
/**
 * Serializes a flat string map into a URL-encoded form body.
 * Keys and values are escaped with `encodeURIComponent` and joined with `&`.
 */
function encodeForm(data: Record<string, string>): string {
  const pairs: string[] = [];
  for (const key of Object.keys(data)) {
    const encodedKey = encodeURIComponent(key);
    const encodedValue = encodeURIComponent(data[key]!);
    pairs.push(`${encodedKey}=${encodedValue}`);
  }
  return pairs.join("&");
}
164
+
165
+ // ---------------------------------------------------------------------------
166
+ // Parser helperji
167
+ // ---------------------------------------------------------------------------
168
+
169
/** Collapses every run of whitespace into a single space and trims both ends. */
function normalize(str: string): string {
  const collapsed = str.replace(/\s+/g, " ");
  return collapsed.trim();
}
172
+
173
/**
 * Converts a Slovenian-style date such as "25. 3. 2015" to ISO "2015-03-25".
 * Input that contains no `d. m. yyyy` pattern is returned unchanged.
 */
function formatDate(raw: string): string {
  const parts = /(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})/.exec(raw);
  if (parts === null) return raw;

  const [, day, month, year] = parts;
  return [year!, month!.padStart(2, "0"), day!.padStart(2, "0")].join("-");
}
182
+
183
/**
 * Parses an employee-count cell into a number.
 *
 * - Empty text, or text starting with "ni", "brez", "ne" or "/", yields 0.
 * - A range ("5-9", "10 do 19", "3 ali 4", "1,2") yields the mean of its two
 *   bounds, which may be fractional.
 * - Otherwise the first integer in the text is returned, or 0 if there is none.
 */
function parseEmployees(raw: string): number {
  const text = raw.trim();
  if (text === "") return 0;
  if (/^(ni|brez|ne|\/)/i.test(text)) return 0;

  const bounds = /(\d+)\s*(?:ali|do|-|–|,)\s*(\d+)/i.exec(text);
  if (bounds !== null) {
    const low = parseInt(bounds[1] ?? "0", 10);
    const high = parseInt(bounds[2] ?? "0", 10);
    return (low + high) / 2;
  }

  const lone = /(\d+)/.exec(text);
  return lone === null ? 0 : parseInt(lone[1] ?? "0", 10);
}
191
+
192
/**
 * Parses a number written with "." as thousands separator and "," as decimal
 * separator (e.g. "1.234,56").
 *
 * @returns The parsed value, or `null` for `null`, empty text, a lone "-",
 *   or text that `parseFloat` cannot turn into a finite number. Trailing
 *   non-numeric text after a leading number is tolerated by `parseFloat`.
 */
function parseSlovenianNumber(raw: string | null): number | null {
  if (!raw) return null;

  const trimmed = raw.trim();
  if (trimmed === "" || trimmed === "-") return null;

  // Drop every thousands dot, then turn the first comma into a decimal point.
  const canonical = trimmed.split(".").join("").replace(",", ".");
  const parsed = parseFloat(canonical);
  return Number.isFinite(parsed) ? parsed : null;
}
201
+
202
/**
 * Returns the trimmed text between the first `startMarker` and the next
 * `endMarker` that follows it. HTML tags inside the span are NOT removed.
 *
 * - Missing `startMarker`: returns "".
 * - Empty `endMarker`, or `endMarker` not found: returns everything after
 *   `startMarker`.
 */
function extractBetween(html: string, startMarker: string, endMarker: string): string {
  const markerPos = html.indexOf(startMarker);
  if (markerPos < 0) return "";

  const from = markerPos + startMarker.length;
  let to = html.length;
  if (endMarker) to = html.indexOf(endMarker, from);

  const slice = to < 0 ? html.substring(from) : html.substring(from, to);
  return slice.trim();
}
211
+
212
/**
 * Replaces every HTML tag with a single space and decodes the `&nbsp;` and
 * `&amp;` entities (in that order). Other entities are left untouched.
 */
function stripTags(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, " ");
  const withPlainSpaces = withoutTags.replace(/&nbsp;/g, " ");
  return withPlainSpaces.replace(/&amp;/g, "&");
}
216
+
217
/**
 * Extracts cell text from every `<table>` in the HTML.
 *
 * Each element of the result corresponds to ONE table and holds the
 * normalized text of all its `<td>`/`<th>` cells, flattened across rows in
 * document order. Tables without any cells are omitted.
 *
 * NOTE(review): `tableIdx` is accepted but not used by this implementation;
 * every table is returned regardless of its value.
 */
function parseTable(html: string, tableIdx = 0): string[][] {
  const perTable: string[][] = [];

  for (const table of html.matchAll(/<table[^>]*>([\s\S]*?)<\/table>/gi)) {
    const tableCells: string[] = [];

    for (const row of table[1]!.matchAll(/<tr[^>]*>([\s\S]*?)<\/tr>/gi)) {
      const rowCells = Array.from(
        row[1]!.matchAll(/<t[dh][^>]*>([\s\S]*?)<\/t[dh]>/gi),
        (cell) => normalize(stripTags(cell[1]!)),
      );
      tableCells.push(...rowCells);
    }

    // Only rows with at least one cell contribute, so a non-empty cell list
    // means at least one such row existed.
    if (tableCells.length > 0) perTable.push(tableCells);
  }

  return perTable;
}
239
+
240
+ // ---------------------------------------------------------------------------
241
+ // FetchBizisiScraper
242
+ // ---------------------------------------------------------------------------
243
+
244
+ export class BizisiScraper {
245
+ private jar = new CookieJar();
246
+ private _loggedIn = false;
247
/**
 * Pending or resolved tesseract.js worker. The PROMISE is cached (not the
 * worker) so that concurrent callers share a single worker instead of each
 * creating their own while the first one is still starting up.
 */
private ocrWorkerPromise: Promise<Worker> | null = null;

/**
 * Returns the shared OCR worker, creating it on first use.
 * If creation fails, the cached promise is cleared so a later call can retry.
 */
async #getOcrWorker(): Promise<Worker> {
  if (!this.ocrWorkerPromise) {
    const pending = createWorker("slv", 1, {
      logger: () => {}, // silence progress logging
    });
    this.ocrWorkerPromise = pending;
    void pending.catch(() => {
      if (this.ocrWorkerPromise === pending) this.ocrWorkerPromise = null;
    });
  }
  return this.ocrWorkerPromise;
}

/**
 * Terminates the OCR worker, if one was created. Call this when finished with
 * the scraper. Termination is best-effort: errors are ignored.
 */
async close(): Promise<void> {
  const pending = this.ocrWorkerPromise;
  if (!pending) return;
  this.ocrWorkerPromise = null;
  try {
    const worker = await pending;
    await worker.terminate();
  } catch {
    // Best effort: a worker that failed to start or stop needs no further cleanup here.
  }
}
266
+
267
/**
 * Downloads one image and runs OCR on it.
 *
 * The URL is resolved against `BASE_URL`, so site-relative `src` values work,
 * and the request goes through the cookie jar so it carries the same session
 * as the page it was found on.
 *
 * @param url Absolute or site-relative image URL.
 * @returns The recognized text, trimmed. On any failure (network error,
 *   non-2xx status, OCR error) the original `url` is returned as a fallback.
 */
async #ocrImage(url: string): Promise<string> {
  try {
    const absoluteUrl = new URL(url, BASE_URL).href;
    const resp = await fetchWithCookies(absoluteUrl, this.jar, {
      headers: { Accept: "image/*,*/*;q=0.8" },
    });
    if (!resp.ok) {
      // Do not feed an error page to the OCR engine.
      throw new Error(`Image request failed with HTTP ${resp.status}`);
    }
    const buf = Buffer.from(await resp.arrayBuffer());
    const worker = await this.#getOcrWorker();
    const { data } = await worker.recognize(buf);
    return data.text.trim();
  } catch {
    return url; // fall back to the URL
  }
}
279
+
280
/** `true` once `login()` has completed without throwing. */
get loggedIn(): boolean {
  return this._loggedIn;
}
283
+
284
+ // -----------------------------------------------------------------------
285
+ // Login
286
+ // -----------------------------------------------------------------------
287
+
288
/**
 * Logs in to bizi.si through the WebForms login page.
 *
 * Steps: GET the login page to obtain the hidden ASP.NET state fields, then
 * POST them back together with the credentials. Success is inferred from the
 * returned HTML.
 *
 * @param username Account name; defaults to `process.env.BIZISI_USERNAME`.
 * @param password Account password; defaults to `process.env.BIZISI_PASSWORD`.
 * @throws Error when credentials are missing, when `__VIEWSTATE` cannot be
 *   found on the login page, or when the response does not look logged in.
 */
async login(username?: string, password?: string): Promise<void> {
  const user = username ?? process.env.BIZISI_USERNAME;
  const pass = password ?? process.env.BIZISI_PASSWORD;
  if (!user || !pass) throw new Error("Missing BIZISI_USERNAME or BIZISI_PASSWORD");

  // A new attempt invalidates the previous state, so a failed re-login never
  // leaves `loggedIn` reporting true.
  this._loggedIn = false;

  // 1. GET the login page to collect __VIEWSTATE and friends.
  const loginPage = await fetchWithCookies(`${BASE_URL}/prijava/`, this.jar);
  const html = await loginPage.text();

  const aspFields = extractAspFormFields(html);
  if (!aspFields["__VIEWSTATE"]) {
    throw new Error("Could not find __VIEWSTATE on login page");
  }

  // 2. POST the login form.
  const formData: Record<string, string> = {
    ...aspFields,
    "ctl00$cphMain$loginBox1$UserName": user,
    "ctl00$cphMain$loginBox1$Password": pass,
    "ctl00$cphMain$loginBox1$ButtonLogin": "Prijavi se",
    "ctl00$cphMain$loginBox1$cvUN": "",
    "ctl00$cphMain$loginBox1$cvPW": "",
    __LASTFOCUS: "",
    __SCROLLPOSITION: "",
  };

  const loginRes = await fetchWithCookies(`${BASE_URL}/prijava/?rw=1`, this.jar, {
    method: "POST",
    body: encodeForm(formData),
    headers: {
      Referer: `${BASE_URL}/prijava/`,
      Origin: BASE_URL,
    },
  });

  const loginHtml = await loginRes.text();

  // Still showing the login button and no "Moj Bizi" marker is treated as a failed login.
  if (loginHtml.includes("Prijavi se") && !loginHtml.includes("Moj Bizi")) {
    if (/napačno|neveljaven|Presegli/i.test(loginHtml)) {
      throw new Error("Login failed: invalid credentials or rate-limited");
    }
    throw new Error("Login failed: could not verify");
  }

  this._loggedIn = true;
}
335
+
336
+ // -----------------------------------------------------------------------
337
+ // Search
338
+ // -----------------------------------------------------------------------
339
+
340
/**
 * Runs a site search and returns the parsed result rows.
 *
 * @param query Free-text search term; it is URL-encoded before use.
 */
async search(query: string): Promise<CompanySearchResult[]> {
  const searchUrl = `${BASE_URL}/iskanje?q=${encodeURIComponent(query)}`;
  const response = await fetchWithCookies(searchUrl, this.jar);
  const body = await response.text();
  return this.#parseSearchResults(body);
}
349
+
350
/**
 * Parses the search result page into structured rows.
 *
 * A row is a `div` whose id starts with
 * `ctl00_cphMain_DisplayRecords1_RepeaterResults_` and whose class contains
 * `row b-table-row`. Rows without a company link are skipped.
 *
 * Cell positions used: 2 = address, 3 = city, 4 = registration number,
 * 5 = tax number, 6 = activity, 7 = employees. Missing cells become "".
 * The returned `url` is site-relative and always ends with exactly one "/".
 */
#parseSearchResults(html: string): CompanySearchResult[] {
  const rowPattern =
    /<div[^>]*id="ctl00_cphMain_DisplayRecords1_RepeaterResults_[^"]*"[^>]*class="[^"]*row b-table-row[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<\/div>/gi;
  const found: CompanySearchResult[] = [];

  for (const row of html.matchAll(rowPattern)) {
    const rowHtml = row[1]!;

    const link =
      /<a[^>]*class="[^"]*b-link-company[^"]*"[^>]*href="([^"]*)"[^>]*>[\s\S]*?<span[^>]*class="b-company-title"[^>]*>([\s\S]*?)<\/span>/i.exec(
        rowHtml,
      );
    if (link === null) continue;

    const href = link[1]!;
    const companyName = normalize(stripTags(link[2]!));

    // Text of every div.col.b-table-cell, in document order.
    const cellTexts = Array.from(
      rowHtml.matchAll(/<div[^>]*class="[^"]*col b-table-cell[^"]*"[^>]*>([\s\S]*?)<\/div>/gi),
      (cell) => normalize(stripTags(cell[1]!)),
    );
    const cellAt = (position: number): string => cellTexts[position] ?? "";

    // Absolute links are reduced to their path; relative links are used as they are.
    let relative = href;
    if (href.startsWith("http")) {
      relative = "/" + href.split("/").slice(3).join("/");
    }

    found.push({
      name: companyName,
      url: relative.replace(/\/+$/, "") + "/",
      address: cellAt(2),
      city: cellAt(3),
      registrationNumber: cellAt(4),
      taxNumber: cellAt(5),
      activity: cellAt(6),
      employees: parseEmployees(cellAt(7)),
    });
  }

  return found;
}
395
+
396
+ // -----------------------------------------------------------------------
397
+ // Company detail
398
+ // -----------------------------------------------------------------------
399
+
400
/**
 * Fetches a company profile page and extracts its details with regexes.
 *
 * Most sections are searched only in the HTML from the first `<h1` onwards,
 * so that content placed before the heading is not picked up.
 *
 * @param relativeUrl Site-relative profile path, with or without a leading "/".
 * @returns The extracted details; fields that cannot be found are "" (or
 *   `false` / empty collections).
 */
async getCompanyDetail(relativeUrl: string): Promise<CompanyDetail> {
  const url = `${BASE_URL}${relativeUrl.startsWith("/") ? "" : "/"}${relativeUrl}`;
  const res = await fetchWithCookies(url, this.jar);
  const html = await res.text();

  // Drop scripts and styles so the regexes below only see markup and text.
  const cleanHtml = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/&nbsp;/g, " ")
    .replace(/&amp;/g, "&");

  const fullText = normalize(stripTags(cleanHtml));

  // Main content starts at the h1.
  const h1Start = cleanHtml.indexOf("<h1");
  const afterH1Html = h1Start >= 0 ? cleanHtml.substring(h1Start) : cleanHtml;

  // Same semantics as extractBetween, restricted to the HTML after the h1.
  const extractAfterH1 = (start: string, end: string): string =>
    extractBetween(afterH1Html, start, end);

  // --- Header info ---
  const h1Match = cleanHtml.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  const h1Name = h1Match ? normalize(stripTags(h1Match[1]!)) : "";
  const titleMatch = cleanHtml.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const title = titleMatch ? titleMatch[1]!.replace(/ - .*$/, "").trim() : "";
  const name = h1Name || title;

  // Phone: visible text of the first tel: link; the href value is used when
  // the link has no text (e.g. an icon-only link).
  const phoneMatch = cleanHtml.match(/<a[^>]*href="tel:([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
  const phone = phoneMatch
    ? normalize(stripTags(phoneMatch[2]!)) || normalize(phoneMatch[1]!)
    : "";

  // Email: prefer the visible text when it looks like an address, otherwise
  // the address part of the mailto: href (without any "?subject=..." suffix).
  const emailMatch = cleanHtml.match(/<a[^>]*href="mailto:([^"]*)"[^>]*>([\s\S]*?)<\/a>/i);
  let email = "";
  if (emailMatch) {
    const linkText = normalize(stripTags(emailMatch[2]!));
    const hrefAddress = normalize(emailMatch[1]!.split("?")[0] ?? "");
    email = linkText.includes("@") ? linkText : hrefAddress || linkText;
  }

  // Website: first link in the header area whose text is an external URL.
  let website = "";
  const headerHtml = extractAfterH1("</h1>", "Matično podjetje");
  const headerLinks = Array.from(headerHtml.matchAll(/<a[^>]*>([\s\S]*?)<\/a>/gi));
  for (const link of headerLinks) {
    const text = normalize(stripTags(link[1]!));
    if (text.startsWith("http") && !text.includes("bizi.si")) {
      website = text;
      break;
    }
  }

  // Address: text between the h1 and the first box title, cut at the first
  // email / phone / "Več" / "Dodaj" / URL, then matched against a
  // "street number, place, 4-digit postcode city" shape.
  let address = "";
  const addrRaw = extractAfterH1("</h1>", '<h5 class="b-box-title');
  if (addrRaw) {
    const addrText = normalize(stripTags(addrRaw));
    const upToMarker =
      addrText
        .split(/\s+\S+@\S+|\s+\d{3}\s\d{3,6}|\s+Več\s+|\s+Dodaj\s+|\s+http/i)[0]
        ?.trim() ?? "";
    const m = upToMarker.match(
      /([A-ZČŠŽa-zčšž][^,]+\d+[^,]*,\s*[^,]+,\s*\d{4}\s+[A-ZČŠŽa-zčšž][^,]+)/,
    );
    if (m) address = m[1]!.trim();
  }

  // --- "Matično podjetje" section ---
  const maticnoHtml = extractAfterH1("Matično podjetje", "Ključne osebe");
  const maticnoText = normalize(stripTags(maticnoHtml));

  const davcnaM = maticnoText.match(/Davčna številka.*?(\d{7,9})/i);
  const maticnaM = maticnoText.match(/Matična.*?(\d{7,10})/i);
  const ddvM = maticnoText.match(/Zavezanec za DDV.*?(Da|Ne)/i);
  const skisM = maticnoText.match(/SKIS\s*:?\s*([^\n]+?)(?:\s+(?:Datum|Vir))/i);
  const vpisM = maticnoText.match(/Datum vpisa\s*:?\s*([^\n]+?)(?:\s+(?:Dejavnost|Vir))/i);
  const dejavnostM = maticnoText.match(/Dejavnost TSmedia\s*:?\s*([^\n]+?)(?:\s+Vir)/i);

  const taxNumber = davcnaM ? davcnaM[1]! : "";
  const registrationNumber = maticnaM ? maticnaM[1]! : "";
  const vatPayer = ddvM ? ddvM[1]!.toLowerCase() === "da" : false;
  // A Slovenian VAT ID is "SI" followed by exactly 8 digits. The word
  // boundaries keep the pattern from matching the start of an IBAN that is
  // printed without spaces.
  const vatNumM = fullText.match(/\bSI(\d{8})\b/);
  const vatNumber = vatNumM ? "SI" + vatNumM[1]! : "";
  const skis = skisM ? skisM[1]!.trim() : "";
  const dateOfEntry = vpisM ? formatDate(vpisM[1]!.trim()) : "";
  const activity = dejavnostM ? dejavnostM[1]!.trim() : "";

  // --- Bank accounts ---
  const trrHtml = extractAfterH1("TRR in blokade", "Bizi obveščevalec");
  const trrText = normalize(stripTags(trrHtml));
  const bankAccounts: BankAccount[] = [];
  const ibanRegex = /(SI\d{2}\s*\d{4}\s*\d{4}\s*\d{4}\s*\d{3})/g;
  for (const ibanM of trrText.matchAll(ibanRegex)) {
    bankAccounts.push({
      iban: ibanM[1]!.replace(/\s+/g, " ").trim(),
      // NOTE(review): status, opening date and bank are not parsed from the
      // page; every account is reported as "odprt".
      status: "odprt",
      openedDate: null,
      bank: null,
    });
  }

  // --- Key people (names are rendered as images and need OCR) ---
  const osebeHtml = extractAfterH1("Ključne osebe", "Bonitetna ocena");
  const keyPeople = await this.#ocrKeyPeople(this.#parseKeyPeople(osebeHtml));

  // --- Budget financing ---
  const finHtml = extractAfterH1("Financiranje iz proračuna", "Ne spreglejte");
  const finText = normalize(stripTags(finHtml));
  const financingFromBudget = this.#parseFinancing(finText);

  return {
    name,
    fullAddress: address,
    phone,
    website,
    email,
    taxNumber,
    registrationNumber,
    vatPayer,
    vatNumber,
    skis,
    dateOfEntry,
    activity,
    keyPeople,
    bankAccounts,
    financingFromBudget,
  };
}
532
+
533
+ // -----------------------------------------------------------------------
534
+ // Financial data
535
+ // -----------------------------------------------------------------------
536
+
537
/**
 * Fetches the "financni-podatki" sub-page of a company and extracts yearly
 * figures from its tables.
 *
 * @param relativeUrl Site-relative profile path, with or without a leading
 *   "/" and with or without trailing slashes.
 * @returns One entry per table that mentions a year 2020-2099; an empty array
 *   when the page looks like a paywall / registration prompt.
 *
 * NOTE(review): `parseTable` returns ONE flattened cell list per table, so
 * each `cells` array below spans a whole table rather than a single row. The
 * "first amount = revenue, second = operating result" mapping therefore
 * depends on the table layout — verify against a real page.
 */
async getFinancialData(relativeUrl: string): Promise<FinancialData[]> {
  const basePath = relativeUrl.replace(/\/+$/, "");
  // Accept paths without a leading slash, as getCompanyDetail does.
  const separator = basePath === "" || basePath.startsWith("/") ? "" : "/";
  const finUrl = `${BASE_URL}${separator}${basePath}/financni-podatki/`;

  const res = await fetchWithCookies(finUrl, this.jar);
  const html = await res.text();

  // Paywall / registration prompt: nothing to parse.
  if (/registriraj.*se.*brezplačno|naroči.*polni.*dostop|preizkusi/i.test(html)) {
    return [];
  }

  const cleanHtml = html
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "");

  const tables = parseTable(cleanHtml);
  const data: FinancialData[] = [];

  for (const cells of tables) {
    const joined = cells.join(" ");
    const yearMatch = joined.match(/\b(20[2-9]\d)\b/);
    if (!yearMatch) continue;

    const year = parseInt(yearMatch[1]!, 10);
    // First positive amount = net sales revenue, second = operating result.
    // Cells that are just a four-digit year (column headers) are skipped so
    // the year itself is never reported as a monetary amount.
    const amounts = cells
      .filter((c) => !/^(?:19|20)\d{2}\.?$/.test(c.trim()))
      .map((c) => parseSlovenianNumber(c))
      .filter((n): n is number => n !== null && n > 0);

    data.push({
      year,
      netSalesRevenue: amounts[0] ?? null,
      operatingResult: amounts[1] ?? null,
    });
  }

  return data;
}
577
+
578
+ // -----------------------------------------------------------------------
579
+ // Scrape
580
+ // -----------------------------------------------------------------------
581
+
582
/**
 * Scrapes one company: first the profile details, then the financial data.
 * The two requests are issued one after the other.
 */
async scrapeCompany(relativeUrl: string): Promise<ScrapeResult> {
  const result: ScrapeResult = {
    company: await this.getCompanyDetail(relativeUrl),
    financialData: await this.getFinancialData(relativeUrl),
  };
  return result;
}
587
+
588
/**
 * Searches for companies and scrapes each result in turn.
 *
 * @param query Free-text search term.
 * @param options.maxResults Upper bound on scraped results; 0 or less
 *   (the default) means no limit.
 * @param options.includeFinancialData When `false`, the financial sub-page is
 *   not requested and `financialData` is `[]`. Defaults to `true`.
 * @returns One entry per processed search result. If scraping a company
 *   throws, the error is logged and a placeholder built from the search row
 *   is returned for it instead.
 */
async scrape(
  query: string,
  options?: { maxResults?: number; includeFinancialData?: boolean },
): Promise<ScrapeResult[]> {
  const maxResults = options?.maxResults ?? 0;
  const includeFinancial = options?.includeFinancialData ?? true;

  const results = await this.search(query);
  const limited = maxResults > 0 ? results.slice(0, maxResults) : results;

  const output: ScrapeResult[] = [];
  for (const result of limited) {
    try {
      // Honour includeFinancialData: skip the extra request when it is off.
      const scraped: ScrapeResult = includeFinancial
        ? await this.scrapeCompany(result.url)
        : { company: await this.getCompanyDetail(result.url), financialData: [] };
      output.push(scraped);
    } catch (err) {
      console.error(`Failed to scrape ${result.name}:`, err);
      output.push({
        company: {
          name: result.name,
          fullAddress: "",
          phone: "",
          website: "",
          email: "",
          taxNumber: result.taxNumber,
          registrationNumber: result.registrationNumber,
          vatPayer: false,
          vatNumber: "",
          skis: "",
          dateOfEntry: "",
          activity: result.activity,
          keyPeople: { nadzorniki: [], ustanovitelji: [], zastopniki: [] },
          bankAccounts: [],
          financingFromBudget: {},
        },
        financialData: [],
      });
    }
  }

  return output;
}
630
+
631
+ // -----------------------------------------------------------------------
632
+ // Private helpers
633
+ // -----------------------------------------------------------------------
634
+
635
/** Lower-case fragments of status messages that must not be treated as person names. */
static #statusPhrases = [
  'ni vpisan', 'ni vpisanih', 'ni aktivnih',
  'trenutno ni', 'ni podatka', 'ni zabeleženih',
  'ni najdenih', 'brez podatka',
];

/** Returns `true` when the text (case-insensitively) contains any status phrase. */
static #isStatus(text: string): boolean {
  const haystack = text.toLowerCase();
  for (const phrase of BizisiScraper.#statusPhrases) {
    if (haystack.includes(phrase)) return true;
  }
  return false;
}
646
+
647
/**
 * Splits the "Ključne osebe" section into its categories and collects, for
 * every person, the ImageGenerator image URLs that render that person's name.
 *
 * Each `<span class="...b-attr-value...">` is treated as one person. The
 * person's image URLs are joined with "|||" into a single string, which
 * `#ocrKeyPeople` later splits again.
 *
 * A category's section runs from its label to the nearest following label of
 * any other category, or to the end of the HTML. Categories may therefore be
 * missing or appear in any order without leaking people into each other.
 */
#parseKeyPeople(html: string): KeyPeople {
  const result: KeyPeople = { nadzorniki: [], ustanovitelji: [], zastopniki: [] };
  if (!html) return result;

  const cats = [
    { label: 'Nadzorniki:', key: 'nadzorniki' as const },
    { label: 'Ustanovitelji:', key: 'ustanovitelji' as const },
    { label: 'Zastopniki:', key: 'zastopniki' as const },
  ];

  for (const { label, key } of cats) {
    const labelIdx = html.indexOf(label);
    if (labelIdx < 0) continue;

    const sectionStart = labelIdx + label.length;
    // End at the closest label of another category that follows this one.
    let sectionEnd = html.length;
    for (const other of cats) {
      if (other.key === key) continue;
      const otherIdx = html.indexOf(other.label, sectionStart);
      if (otherIdx >= 0 && otherIdx < sectionEnd) sectionEnd = otherIdx;
    }
    const sectionHtml = html.substring(sectionStart, sectionEnd);

    const personSpans = sectionHtml.matchAll(
      /<span[^>]*class="[^"]*b-attr-value[^"]*"[^>]*>([\s\S]*?)<\/span>/gi,
    );
    for (const span of personSpans) {
      const urls = Array.from(
        span[1]!.matchAll(/<img[^>]*src="([^"]*ImageGenerator[^"]*)"[^>]*>/gi),
        (m) => m[1]!,
      );
      if (urls.length > 0) result[key].push(urls.join('|||')); // delimiter consumed by the OCR step
    }
  }

  return result;
}
684
+
685
/**
 * Turns the image-URL entries produced by `#parseKeyPeople` into names.
 *
 * For each entry the "|||"-joined URLs are OCR'd concurrently and the texts
 * are joined with spaces. Entries that come out empty or look like a status
 * message are dropped; role words (direktor, prokurist, ...) are removed from
 * the rest.
 */
async #ocrKeyPeople(raw: KeyPeople): Promise<KeyPeople> {
  const recognized: KeyPeople = { nadzorniki: [], ustanovitelji: [], zastopniki: [] };
  const rolePattern =
    /\s*,?\s*(?:direktor|prokurist|član|predsednik|namestnik|vodja)\s*,?\s*/gi;
  const groups = ['nadzorniki', 'ustanovitelji', 'zastopniki'] as const;

  for (const group of groups) {
    for (const entry of raw[group]) {
      const pieces = await Promise.all(
        entry.split('|||').map((imageUrl: string) => this.#ocrImage(imageUrl)),
      );
      const combined = pieces.join(' ').trim();
      if (combined === '' || BizisiScraper.#isStatus(combined)) continue;

      const personName = combined.replace(rolePattern, '').trim();
      if (personName !== '') recognized[group].push(personName);
    }
  }

  return recognized;
}
706
+
707
/**
 * Extracts "year ... amount" pairs from the budget-financing text.
 *
 * A pair is a 20xx year followed within 1-20 non-digit characters by an
 * amount in Slovenian notation. Only years 2010-2030 are kept; a later match
 * for the same year overwrites an earlier one.
 */
#parseFinancing(text: string): Record<number, number> {
  const byYear: Record<number, number> = {};
  const pairPattern = /\b(20[0-9]{2})\D{1,20}?(\d{1,3}(?:\.\d{3})*(?:,\d{2})?)\s*(?:EUR|€)?/g;

  for (const hit of text.matchAll(pairPattern)) {
    const year = parseInt(hit[1]!, 10);
    if (year < 2010 || year > 2030) continue;

    const amount = parseSlovenianNumber(hit[2] ?? null);
    if (amount === null) continue;

    byYear[year] = amount;
  }

  return byYear;
}
720
+ }