@mendable/firecrawl 4.8.0 → 4.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SP54QQ5D.js → chunk-3IN25WJ7.js} +1 -3
- package/dist/index.cjs +86 -11
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +87 -10
- package/dist/{package-MR6WWGRM.js → package-UXOZB356.js} +1 -1
- package/package.json +1 -3
- package/src/types/node-undici.d.ts +10 -0
- package/src/v1/index.ts +1 -0
- package/src/v2/types.ts +1 -0
- package/src/v2/watcher.ts +102 -9
- package/tsup.config.ts +0 -1
|
@@ -8,7 +8,7 @@ var require_package = __commonJS({
|
|
|
8
8
|
"package.json"(exports, module) {
|
|
9
9
|
module.exports = {
|
|
10
10
|
name: "@mendable/firecrawl-js",
|
|
11
|
-
version: "4.8.0",
|
|
11
|
+
version: "4.8.2",
|
|
12
12
|
description: "JavaScript SDK for Firecrawl API",
|
|
13
13
|
main: "dist/index.js",
|
|
14
14
|
types: "dist/index.d.ts",
|
|
@@ -36,7 +36,6 @@ var require_package = __commonJS({
|
|
|
36
36
|
dependencies: {
|
|
37
37
|
axios: "^1.12.2",
|
|
38
38
|
"typescript-event-target": "^1.1.1",
|
|
39
|
-
ws: "^8.18.3",
|
|
40
39
|
zod: "^3.23.8",
|
|
41
40
|
"zod-to-json-schema": "^3.23.0"
|
|
42
41
|
},
|
|
@@ -51,7 +50,6 @@ var require_package = __commonJS({
|
|
|
51
50
|
"@types/mocha": "^10.0.6",
|
|
52
51
|
"@types/node": "^20.12.12",
|
|
53
52
|
"@types/uuid": "^9.0.8",
|
|
54
|
-
"@types/ws": "^8.18.1",
|
|
55
53
|
dotenv: "^16.4.5",
|
|
56
54
|
jest: "^30.2.0",
|
|
57
55
|
"ts-jest": "^29.4.5",
|
package/dist/index.cjs
CHANGED
|
@@ -35,7 +35,7 @@ var require_package = __commonJS({
|
|
|
35
35
|
"package.json"(exports2, module2) {
|
|
36
36
|
module2.exports = {
|
|
37
37
|
name: "@mendable/firecrawl-js",
|
|
38
|
-
version: "4.8.0",
|
|
38
|
+
version: "4.8.2",
|
|
39
39
|
description: "JavaScript SDK for Firecrawl API",
|
|
40
40
|
main: "dist/index.js",
|
|
41
41
|
types: "dist/index.d.ts",
|
|
@@ -63,7 +63,6 @@ var require_package = __commonJS({
|
|
|
63
63
|
dependencies: {
|
|
64
64
|
axios: "^1.12.2",
|
|
65
65
|
"typescript-event-target": "^1.1.1",
|
|
66
|
-
ws: "^8.18.3",
|
|
67
66
|
zod: "^3.23.8",
|
|
68
67
|
"zod-to-json-schema": "^3.23.0"
|
|
69
68
|
},
|
|
@@ -78,7 +77,6 @@ var require_package = __commonJS({
|
|
|
78
77
|
"@types/mocha": "^10.0.6",
|
|
79
78
|
"@types/node": "^20.12.12",
|
|
80
79
|
"@types/uuid": "^9.0.8",
|
|
81
|
-
"@types/ws": "^8.18.1",
|
|
82
80
|
dotenv: "^16.4.5",
|
|
83
81
|
jest: "^30.2.0",
|
|
84
82
|
"ts-jest": "^29.4.5",
|
|
@@ -832,7 +830,57 @@ async function getTokenUsageHistorical(http, byApiKey) {
|
|
|
832
830
|
|
|
833
831
|
// src/v2/watcher.ts
|
|
834
832
|
var import_events = require("events");
|
|
835
|
-
var
|
|
833
|
+
var hasGlobalWebSocket = () => {
|
|
834
|
+
if (typeof globalThis === "undefined") return void 0;
|
|
835
|
+
const candidate = globalThis.WebSocket;
|
|
836
|
+
return typeof candidate === "function" ? candidate : void 0;
|
|
837
|
+
};
|
|
838
|
+
var isNodeRuntime = () => typeof process !== "undefined" && !!process.versions?.node;
|
|
839
|
+
var cachedWebSocket;
|
|
840
|
+
var loadPromise;
|
|
841
|
+
var loadNodeWebSocket = async () => {
|
|
842
|
+
if (!isNodeRuntime()) return void 0;
|
|
843
|
+
try {
|
|
844
|
+
const undici = await import("undici");
|
|
845
|
+
const ctor = undici.WebSocket ?? undici.default?.WebSocket;
|
|
846
|
+
return typeof ctor === "function" ? ctor : void 0;
|
|
847
|
+
} catch {
|
|
848
|
+
return void 0;
|
|
849
|
+
}
|
|
850
|
+
};
|
|
851
|
+
var getWebSocketCtor = async () => {
|
|
852
|
+
if (cachedWebSocket) return cachedWebSocket;
|
|
853
|
+
const globalWs = hasGlobalWebSocket();
|
|
854
|
+
if (globalWs) {
|
|
855
|
+
cachedWebSocket = globalWs;
|
|
856
|
+
return cachedWebSocket;
|
|
857
|
+
}
|
|
858
|
+
if (!loadPromise) {
|
|
859
|
+
loadPromise = loadNodeWebSocket();
|
|
860
|
+
}
|
|
861
|
+
cachedWebSocket = await loadPromise;
|
|
862
|
+
return cachedWebSocket;
|
|
863
|
+
};
|
|
864
|
+
var decoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : void 0;
|
|
865
|
+
var ensureUtf8String = (data) => {
|
|
866
|
+
if (typeof data === "string") return data;
|
|
867
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) {
|
|
868
|
+
return data.toString("utf8");
|
|
869
|
+
}
|
|
870
|
+
const convertView = (view) => {
|
|
871
|
+
if (typeof Buffer !== "undefined") {
|
|
872
|
+
return Buffer.from(view.buffer, view.byteOffset, view.byteLength).toString("utf8");
|
|
873
|
+
}
|
|
874
|
+
return decoder?.decode(view);
|
|
875
|
+
};
|
|
876
|
+
if (ArrayBuffer.isView(data)) {
|
|
877
|
+
return convertView(data);
|
|
878
|
+
}
|
|
879
|
+
if (data instanceof ArrayBuffer) {
|
|
880
|
+
return convertView(new Uint8Array(data));
|
|
881
|
+
}
|
|
882
|
+
return void 0;
|
|
883
|
+
};
|
|
836
884
|
var Watcher = class extends import_events.EventEmitter {
|
|
837
885
|
http;
|
|
838
886
|
jobId;
|
|
@@ -841,6 +889,7 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
841
889
|
timeout;
|
|
842
890
|
ws;
|
|
843
891
|
closed = false;
|
|
892
|
+
emittedDocumentKeys = /* @__PURE__ */ new Set();
|
|
844
893
|
constructor(http, jobId, opts = {}) {
|
|
845
894
|
super();
|
|
846
895
|
this.http = http;
|
|
@@ -858,10 +907,14 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
858
907
|
async start() {
|
|
859
908
|
try {
|
|
860
909
|
const url = this.buildWsUrl();
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
910
|
+
const wsCtor = await getWebSocketCtor();
|
|
911
|
+
if (!wsCtor) {
|
|
912
|
+
this.pollLoop();
|
|
913
|
+
return;
|
|
914
|
+
}
|
|
915
|
+
this.ws = new wsCtor(url, this.http.getApiKey());
|
|
916
|
+
if (this.ws && "binaryType" in this.ws) {
|
|
917
|
+
this.ws.binaryType = "arraybuffer";
|
|
865
918
|
}
|
|
866
919
|
if (this.ws) {
|
|
867
920
|
this.attachWsHandlers(this.ws);
|
|
@@ -875,8 +928,9 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
875
928
|
const timeoutMs = this.timeout ? this.timeout * 1e3 : void 0;
|
|
876
929
|
ws.onmessage = (ev) => {
|
|
877
930
|
try {
|
|
878
|
-
const
|
|
879
|
-
if (!
|
|
931
|
+
const raw = ensureUtf8String(ev.data);
|
|
932
|
+
if (!raw) return;
|
|
933
|
+
const body = JSON.parse(raw);
|
|
880
934
|
const type = body.type;
|
|
881
935
|
if (type === "error") {
|
|
882
936
|
this.emit("error", { status: "failed", data: [], error: body.error, id: this.jobId });
|
|
@@ -896,6 +950,7 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
896
950
|
if (type === "done") {
|
|
897
951
|
const payload2 = body.data || body;
|
|
898
952
|
const data = payload2.data || [];
|
|
953
|
+
if (data.length) this.emitDocuments(data);
|
|
899
954
|
this.emit("done", { status: "completed", data, id: this.jobId });
|
|
900
955
|
this.close();
|
|
901
956
|
return;
|
|
@@ -914,8 +969,27 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
914
969
|
if (!this.closed) this.pollLoop();
|
|
915
970
|
};
|
|
916
971
|
}
|
|
972
|
+
documentKey(doc) {
|
|
973
|
+
if (doc && typeof doc === "object") {
|
|
974
|
+
const explicitId = doc.id ?? doc.docId ?? doc.url;
|
|
975
|
+
if (typeof explicitId === "string" && explicitId.length) {
|
|
976
|
+
return explicitId;
|
|
977
|
+
}
|
|
978
|
+
}
|
|
979
|
+
try {
|
|
980
|
+
return JSON.stringify(doc);
|
|
981
|
+
} catch {
|
|
982
|
+
return `${Date.now()}-${Math.random()}`;
|
|
983
|
+
}
|
|
984
|
+
}
|
|
917
985
|
emitDocuments(docs) {
|
|
918
|
-
for (const doc of docs)
|
|
986
|
+
for (const doc of docs) {
|
|
987
|
+
if (!doc) continue;
|
|
988
|
+
const key = this.documentKey(doc);
|
|
989
|
+
if (this.emittedDocumentKeys.has(key)) continue;
|
|
990
|
+
this.emittedDocumentKeys.add(key);
|
|
991
|
+
this.emit("document", { ...doc, id: this.jobId });
|
|
992
|
+
}
|
|
919
993
|
}
|
|
920
994
|
emitSnapshot(payload) {
|
|
921
995
|
const status = payload.status;
|
|
@@ -951,6 +1025,7 @@ var Watcher = class extends import_events.EventEmitter {
|
|
|
951
1025
|
while (!this.closed) {
|
|
952
1026
|
try {
|
|
953
1027
|
const snap = this.kind === "crawl" ? await getCrawlStatus(this.http, this.jobId) : await getBatchScrapeStatus(this.http, this.jobId);
|
|
1028
|
+
this.emitDocuments(snap.data || []);
|
|
954
1029
|
this.emit("snapshot", snap);
|
|
955
1030
|
if (["completed", "failed", "cancelled"].includes(snap.status)) {
|
|
956
1031
|
this.emit("done", { status: snap.status, data: snap.data, id: this.jobId });
|
package/dist/index.d.cts
CHANGED
|
@@ -444,6 +444,7 @@ interface ExtractResponse$1 {
|
|
|
444
444
|
warning?: string;
|
|
445
445
|
sources?: Record<string, unknown>;
|
|
446
446
|
expiresAt?: string;
|
|
447
|
+
creditsUsed?: number;
|
|
447
448
|
}
|
|
448
449
|
interface AgentOptions$1 {
|
|
449
450
|
model: 'FIRE-1';
|
|
@@ -578,10 +579,12 @@ declare class Watcher extends EventEmitter {
|
|
|
578
579
|
private readonly timeout?;
|
|
579
580
|
private ws?;
|
|
580
581
|
private closed;
|
|
582
|
+
private readonly emittedDocumentKeys;
|
|
581
583
|
constructor(http: HttpClient, jobId: string, opts?: WatcherOptions);
|
|
582
584
|
private buildWsUrl;
|
|
583
585
|
start(): Promise<void>;
|
|
584
586
|
private attachWsHandlers;
|
|
587
|
+
private documentKey;
|
|
585
588
|
private emitDocuments;
|
|
586
589
|
private emitSnapshot;
|
|
587
590
|
private pollLoop;
|
|
@@ -1098,6 +1101,7 @@ interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
|
|
|
1098
1101
|
error?: string;
|
|
1099
1102
|
warning?: string;
|
|
1100
1103
|
sources?: string[];
|
|
1104
|
+
creditsUsed?: number;
|
|
1101
1105
|
}
|
|
1102
1106
|
/**
|
|
1103
1107
|
* Error response interface.
|
package/dist/index.d.ts
CHANGED
|
@@ -444,6 +444,7 @@ interface ExtractResponse$1 {
|
|
|
444
444
|
warning?: string;
|
|
445
445
|
sources?: Record<string, unknown>;
|
|
446
446
|
expiresAt?: string;
|
|
447
|
+
creditsUsed?: number;
|
|
447
448
|
}
|
|
448
449
|
interface AgentOptions$1 {
|
|
449
450
|
model: 'FIRE-1';
|
|
@@ -578,10 +579,12 @@ declare class Watcher extends EventEmitter {
|
|
|
578
579
|
private readonly timeout?;
|
|
579
580
|
private ws?;
|
|
580
581
|
private closed;
|
|
582
|
+
private readonly emittedDocumentKeys;
|
|
581
583
|
constructor(http: HttpClient, jobId: string, opts?: WatcherOptions);
|
|
582
584
|
private buildWsUrl;
|
|
583
585
|
start(): Promise<void>;
|
|
584
586
|
private attachWsHandlers;
|
|
587
|
+
private documentKey;
|
|
585
588
|
private emitDocuments;
|
|
586
589
|
private emitSnapshot;
|
|
587
590
|
private pollLoop;
|
|
@@ -1098,6 +1101,7 @@ interface ExtractResponse<LLMSchema extends zt.ZodSchema = any> {
|
|
|
1098
1101
|
error?: string;
|
|
1099
1102
|
warning?: string;
|
|
1100
1103
|
sources?: string[];
|
|
1104
|
+
creditsUsed?: number;
|
|
1101
1105
|
}
|
|
1102
1106
|
/**
|
|
1103
1107
|
* Error response interface.
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
require_package
|
|
3
|
-
} from "./chunk-SP54QQ5D.js";
|
|
3
|
+
} from "./chunk-3IN25WJ7.js";
|
|
4
4
|
|
|
5
5
|
// src/v2/utils/httpClient.ts
|
|
6
6
|
import axios from "axios";
|
|
@@ -712,7 +712,57 @@ async function getTokenUsageHistorical(http, byApiKey) {
|
|
|
712
712
|
|
|
713
713
|
// src/v2/watcher.ts
|
|
714
714
|
import { EventEmitter } from "events";
|
|
715
|
-
|
|
715
|
+
var hasGlobalWebSocket = () => {
|
|
716
|
+
if (typeof globalThis === "undefined") return void 0;
|
|
717
|
+
const candidate = globalThis.WebSocket;
|
|
718
|
+
return typeof candidate === "function" ? candidate : void 0;
|
|
719
|
+
};
|
|
720
|
+
var isNodeRuntime = () => typeof process !== "undefined" && !!process.versions?.node;
|
|
721
|
+
var cachedWebSocket;
|
|
722
|
+
var loadPromise;
|
|
723
|
+
var loadNodeWebSocket = async () => {
|
|
724
|
+
if (!isNodeRuntime()) return void 0;
|
|
725
|
+
try {
|
|
726
|
+
const undici = await import("undici");
|
|
727
|
+
const ctor = undici.WebSocket ?? undici.default?.WebSocket;
|
|
728
|
+
return typeof ctor === "function" ? ctor : void 0;
|
|
729
|
+
} catch {
|
|
730
|
+
return void 0;
|
|
731
|
+
}
|
|
732
|
+
};
|
|
733
|
+
var getWebSocketCtor = async () => {
|
|
734
|
+
if (cachedWebSocket) return cachedWebSocket;
|
|
735
|
+
const globalWs = hasGlobalWebSocket();
|
|
736
|
+
if (globalWs) {
|
|
737
|
+
cachedWebSocket = globalWs;
|
|
738
|
+
return cachedWebSocket;
|
|
739
|
+
}
|
|
740
|
+
if (!loadPromise) {
|
|
741
|
+
loadPromise = loadNodeWebSocket();
|
|
742
|
+
}
|
|
743
|
+
cachedWebSocket = await loadPromise;
|
|
744
|
+
return cachedWebSocket;
|
|
745
|
+
};
|
|
746
|
+
var decoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : void 0;
|
|
747
|
+
var ensureUtf8String = (data) => {
|
|
748
|
+
if (typeof data === "string") return data;
|
|
749
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) {
|
|
750
|
+
return data.toString("utf8");
|
|
751
|
+
}
|
|
752
|
+
const convertView = (view) => {
|
|
753
|
+
if (typeof Buffer !== "undefined") {
|
|
754
|
+
return Buffer.from(view.buffer, view.byteOffset, view.byteLength).toString("utf8");
|
|
755
|
+
}
|
|
756
|
+
return decoder?.decode(view);
|
|
757
|
+
};
|
|
758
|
+
if (ArrayBuffer.isView(data)) {
|
|
759
|
+
return convertView(data);
|
|
760
|
+
}
|
|
761
|
+
if (data instanceof ArrayBuffer) {
|
|
762
|
+
return convertView(new Uint8Array(data));
|
|
763
|
+
}
|
|
764
|
+
return void 0;
|
|
765
|
+
};
|
|
716
766
|
var Watcher = class extends EventEmitter {
|
|
717
767
|
http;
|
|
718
768
|
jobId;
|
|
@@ -721,6 +771,7 @@ var Watcher = class extends EventEmitter {
|
|
|
721
771
|
timeout;
|
|
722
772
|
ws;
|
|
723
773
|
closed = false;
|
|
774
|
+
emittedDocumentKeys = /* @__PURE__ */ new Set();
|
|
724
775
|
constructor(http, jobId, opts = {}) {
|
|
725
776
|
super();
|
|
726
777
|
this.http = http;
|
|
@@ -738,10 +789,14 @@ var Watcher = class extends EventEmitter {
|
|
|
738
789
|
async start() {
|
|
739
790
|
try {
|
|
740
791
|
const url = this.buildWsUrl();
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
792
|
+
const wsCtor = await getWebSocketCtor();
|
|
793
|
+
if (!wsCtor) {
|
|
794
|
+
this.pollLoop();
|
|
795
|
+
return;
|
|
796
|
+
}
|
|
797
|
+
this.ws = new wsCtor(url, this.http.getApiKey());
|
|
798
|
+
if (this.ws && "binaryType" in this.ws) {
|
|
799
|
+
this.ws.binaryType = "arraybuffer";
|
|
745
800
|
}
|
|
746
801
|
if (this.ws) {
|
|
747
802
|
this.attachWsHandlers(this.ws);
|
|
@@ -755,8 +810,9 @@ var Watcher = class extends EventEmitter {
|
|
|
755
810
|
const timeoutMs = this.timeout ? this.timeout * 1e3 : void 0;
|
|
756
811
|
ws.onmessage = (ev) => {
|
|
757
812
|
try {
|
|
758
|
-
const
|
|
759
|
-
if (!
|
|
813
|
+
const raw = ensureUtf8String(ev.data);
|
|
814
|
+
if (!raw) return;
|
|
815
|
+
const body = JSON.parse(raw);
|
|
760
816
|
const type = body.type;
|
|
761
817
|
if (type === "error") {
|
|
762
818
|
this.emit("error", { status: "failed", data: [], error: body.error, id: this.jobId });
|
|
@@ -776,6 +832,7 @@ var Watcher = class extends EventEmitter {
|
|
|
776
832
|
if (type === "done") {
|
|
777
833
|
const payload2 = body.data || body;
|
|
778
834
|
const data = payload2.data || [];
|
|
835
|
+
if (data.length) this.emitDocuments(data);
|
|
779
836
|
this.emit("done", { status: "completed", data, id: this.jobId });
|
|
780
837
|
this.close();
|
|
781
838
|
return;
|
|
@@ -794,8 +851,27 @@ var Watcher = class extends EventEmitter {
|
|
|
794
851
|
if (!this.closed) this.pollLoop();
|
|
795
852
|
};
|
|
796
853
|
}
|
|
854
|
+
documentKey(doc) {
|
|
855
|
+
if (doc && typeof doc === "object") {
|
|
856
|
+
const explicitId = doc.id ?? doc.docId ?? doc.url;
|
|
857
|
+
if (typeof explicitId === "string" && explicitId.length) {
|
|
858
|
+
return explicitId;
|
|
859
|
+
}
|
|
860
|
+
}
|
|
861
|
+
try {
|
|
862
|
+
return JSON.stringify(doc);
|
|
863
|
+
} catch {
|
|
864
|
+
return `${Date.now()}-${Math.random()}`;
|
|
865
|
+
}
|
|
866
|
+
}
|
|
797
867
|
emitDocuments(docs) {
|
|
798
|
-
for (const doc of docs)
|
|
868
|
+
for (const doc of docs) {
|
|
869
|
+
if (!doc) continue;
|
|
870
|
+
const key = this.documentKey(doc);
|
|
871
|
+
if (this.emittedDocumentKeys.has(key)) continue;
|
|
872
|
+
this.emittedDocumentKeys.add(key);
|
|
873
|
+
this.emit("document", { ...doc, id: this.jobId });
|
|
874
|
+
}
|
|
799
875
|
}
|
|
800
876
|
emitSnapshot(payload) {
|
|
801
877
|
const status = payload.status;
|
|
@@ -831,6 +907,7 @@ var Watcher = class extends EventEmitter {
|
|
|
831
907
|
while (!this.closed) {
|
|
832
908
|
try {
|
|
833
909
|
const snap = this.kind === "crawl" ? await getCrawlStatus(this.http, this.jobId) : await getBatchScrapeStatus(this.http, this.jobId);
|
|
910
|
+
this.emitDocuments(snap.data || []);
|
|
834
911
|
this.emit("snapshot", snap);
|
|
835
912
|
if (["completed", "failed", "cancelled"].includes(snap.status)) {
|
|
836
913
|
this.emit("done", { status: snap.status, data: snap.data, id: this.jobId });
|
|
@@ -1087,7 +1164,7 @@ var FirecrawlApp = class {
|
|
|
1087
1164
|
if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
|
|
1088
1165
|
return process.env.npm_package_version;
|
|
1089
1166
|
}
|
|
1090
|
-
const packageJson = await import("./package-MR6WWGRM.js");
|
|
1167
|
+
const packageJson = await import("./package-UXOZB356.js");
|
|
1091
1168
|
return packageJson.default.version;
|
|
1092
1169
|
} catch (error) {
|
|
1093
1170
|
const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mendable/firecrawl",
|
|
3
|
-
"version": "4.8.0",
|
|
3
|
+
"version": "4.8.2",
|
|
4
4
|
"description": "JavaScript SDK for Firecrawl API",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -28,7 +28,6 @@
|
|
|
28
28
|
"dependencies": {
|
|
29
29
|
"axios": "^1.12.2",
|
|
30
30
|
"typescript-event-target": "^1.1.1",
|
|
31
|
-
"ws": "^8.18.3",
|
|
32
31
|
"zod": "^3.23.8",
|
|
33
32
|
"zod-to-json-schema": "^3.23.0"
|
|
34
33
|
},
|
|
@@ -43,7 +42,6 @@
|
|
|
43
42
|
"@types/mocha": "^10.0.6",
|
|
44
43
|
"@types/node": "^20.12.12",
|
|
45
44
|
"@types/uuid": "^9.0.8",
|
|
46
|
-
"@types/ws": "^8.18.1",
|
|
47
45
|
"dotenv": "^16.4.5",
|
|
48
46
|
"jest": "^30.2.0",
|
|
49
47
|
"ts-jest": "^29.4.5",
|
package/src/v1/index.ts
CHANGED
package/src/v2/types.ts
CHANGED
package/src/v2/watcher.ts
CHANGED
|
@@ -3,7 +3,73 @@ import type { BatchScrapeJob, CrawlJob, Document } from "./types";
|
|
|
3
3
|
import type { HttpClient } from "./utils/httpClient";
|
|
4
4
|
import { getBatchScrapeStatus } from "./methods/batch";
|
|
5
5
|
import { getCrawlStatus } from "./methods/crawl";
|
|
6
|
-
|
|
6
|
+
// Note: browsers/Deno expose globalThis.WebSocket, but many Node runtimes (<22.4 or without
|
|
7
|
+
// experimental flags) do not. We lazily fall back to node:undici.
|
|
8
|
+
|
|
9
|
+
type WebSocketConstructor = new (url: string, protocols?: string | string[]) => WebSocket;
|
|
10
|
+
|
|
11
|
+
const hasGlobalWebSocket = (): WebSocketConstructor | undefined => {
|
|
12
|
+
if (typeof globalThis === "undefined") return undefined;
|
|
13
|
+
const candidate = (globalThis as any).WebSocket;
|
|
14
|
+
return typeof candidate === "function" ? (candidate as WebSocketConstructor) : undefined;
|
|
15
|
+
};
|
|
16
|
+
|
|
17
|
+
const isNodeRuntime = () => typeof process !== "undefined" && !!process.versions?.node;
|
|
18
|
+
|
|
19
|
+
let cachedWebSocket: WebSocketConstructor | undefined;
|
|
20
|
+
let loadPromise: Promise<WebSocketConstructor | undefined> | undefined;
|
|
21
|
+
|
|
22
|
+
const loadNodeWebSocket = async (): Promise<WebSocketConstructor | undefined> => {
|
|
23
|
+
if (!isNodeRuntime()) return undefined;
|
|
24
|
+
try {
|
|
25
|
+
const undici = await import("node:undici");
|
|
26
|
+
const ctor = (undici as any).WebSocket ?? (undici as any).default?.WebSocket;
|
|
27
|
+
return typeof ctor === "function" ? (ctor as WebSocketConstructor) : undefined;
|
|
28
|
+
} catch {
|
|
29
|
+
return undefined;
|
|
30
|
+
}
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
const getWebSocketCtor = async (): Promise<WebSocketConstructor | undefined> => {
|
|
34
|
+
if (cachedWebSocket) return cachedWebSocket;
|
|
35
|
+
const globalWs = hasGlobalWebSocket();
|
|
36
|
+
if (globalWs) {
|
|
37
|
+
cachedWebSocket = globalWs;
|
|
38
|
+
return cachedWebSocket;
|
|
39
|
+
}
|
|
40
|
+
if (!loadPromise) {
|
|
41
|
+
loadPromise = loadNodeWebSocket();
|
|
42
|
+
}
|
|
43
|
+
cachedWebSocket = await loadPromise;
|
|
44
|
+
return cachedWebSocket;
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
const decoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : undefined;
|
|
48
|
+
|
|
49
|
+
const ensureUtf8String = (data: unknown): string | undefined => {
|
|
50
|
+
if (typeof data === "string") return data;
|
|
51
|
+
|
|
52
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) {
|
|
53
|
+
return data.toString("utf8");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
const convertView = (view: ArrayBufferView): string | undefined => {
|
|
57
|
+
if (typeof Buffer !== "undefined") {
|
|
58
|
+
return Buffer.from(view.buffer, view.byteOffset, view.byteLength).toString("utf8");
|
|
59
|
+
}
|
|
60
|
+
return decoder?.decode(view);
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
if (ArrayBuffer.isView(data)) {
|
|
64
|
+
return convertView(data);
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
if (data instanceof ArrayBuffer) {
|
|
68
|
+
return convertView(new Uint8Array(data));
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
return undefined;
|
|
72
|
+
};
|
|
7
73
|
|
|
8
74
|
type JobKind = "crawl" | "batch";
|
|
9
75
|
|
|
@@ -23,6 +89,7 @@ export class Watcher extends EventEmitter {
|
|
|
23
89
|
private readonly timeout?: number;
|
|
24
90
|
private ws?: WebSocket;
|
|
25
91
|
private closed = false;
|
|
92
|
+
private readonly emittedDocumentKeys = new Set<string>();
|
|
26
93
|
|
|
27
94
|
constructor(http: HttpClient, jobId: string, opts: WatcherOptions = {}) {
|
|
28
95
|
super();
|
|
@@ -44,11 +111,14 @@ export class Watcher extends EventEmitter {
|
|
|
44
111
|
async start(): Promise<void> {
|
|
45
112
|
try {
|
|
46
113
|
const url = this.buildWsUrl();
|
|
47
|
-
|
|
48
|
-
if (
|
|
49
|
-
this.
|
|
50
|
-
|
|
51
|
-
|
|
114
|
+
const wsCtor = await getWebSocketCtor();
|
|
115
|
+
if (!wsCtor) {
|
|
116
|
+
this.pollLoop();
|
|
117
|
+
return;
|
|
118
|
+
}
|
|
119
|
+
this.ws = new wsCtor(url, this.http.getApiKey()) as any;
|
|
120
|
+
if (this.ws && "binaryType" in this.ws) {
|
|
121
|
+
(this.ws as any).binaryType = "arraybuffer";
|
|
52
122
|
}
|
|
53
123
|
|
|
54
124
|
if (this.ws) {
|
|
@@ -64,8 +134,9 @@ export class Watcher extends EventEmitter {
|
|
|
64
134
|
const timeoutMs = this.timeout ? this.timeout * 1000 : undefined;
|
|
65
135
|
ws.onmessage = (ev: MessageEvent) => {
|
|
66
136
|
try {
|
|
67
|
-
const
|
|
68
|
-
if (!
|
|
137
|
+
const raw = ensureUtf8String(ev.data);
|
|
138
|
+
if (!raw) return;
|
|
139
|
+
const body = JSON.parse(raw);
|
|
69
140
|
const type = body.type as string | undefined;
|
|
70
141
|
if (type === "error") {
|
|
71
142
|
this.emit("error", { status: "failed", data: [], error: body.error, id: this.jobId });
|
|
@@ -85,6 +156,7 @@ export class Watcher extends EventEmitter {
|
|
|
85
156
|
if (type === "done") {
|
|
86
157
|
const payload = body.data || body;
|
|
87
158
|
const data = (payload.data || []) as Document[];
|
|
159
|
+
if (data.length) this.emitDocuments(data);
|
|
88
160
|
this.emit("done", { status: "completed", data, id: this.jobId });
|
|
89
161
|
this.close();
|
|
90
162
|
return;
|
|
@@ -105,8 +177,28 @@ export class Watcher extends EventEmitter {
|
|
|
105
177
|
};
|
|
106
178
|
}
|
|
107
179
|
|
|
180
|
+
private documentKey(doc: Document): string {
|
|
181
|
+
if (doc && typeof doc === "object") {
|
|
182
|
+
const explicitId = (doc as any).id ?? (doc as any).docId ?? (doc as any).url;
|
|
183
|
+
if (typeof explicitId === "string" && explicitId.length) {
|
|
184
|
+
return explicitId;
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
try {
|
|
188
|
+
return JSON.stringify(doc);
|
|
189
|
+
} catch {
|
|
190
|
+
return `${Date.now()}-${Math.random()}`;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
108
194
|
private emitDocuments(docs: Document[]) {
|
|
109
|
-
for (const doc of docs)
|
|
195
|
+
for (const doc of docs) {
|
|
196
|
+
if (!doc) continue;
|
|
197
|
+
const key = this.documentKey(doc);
|
|
198
|
+
if (this.emittedDocumentKeys.has(key)) continue;
|
|
199
|
+
this.emittedDocumentKeys.add(key);
|
|
200
|
+
this.emit("document", { ...(doc as any), id: this.jobId });
|
|
201
|
+
}
|
|
110
202
|
}
|
|
111
203
|
|
|
112
204
|
private emitSnapshot(payload: any) {
|
|
@@ -148,6 +240,7 @@ export class Watcher extends EventEmitter {
|
|
|
148
240
|
const snap = this.kind === "crawl"
|
|
149
241
|
? await getCrawlStatus(this.http as any, this.jobId)
|
|
150
242
|
: await getBatchScrapeStatus(this.http as any, this.jobId);
|
|
243
|
+
this.emitDocuments((snap.data || []) as Document[]);
|
|
151
244
|
this.emit("snapshot", snap);
|
|
152
245
|
if (["completed", "failed", "cancelled"].includes(snap.status)) {
|
|
153
246
|
this.emit("done", { status: snap.status, data: snap.data, id: this.jobId });
|