@mattgrill/nearline-node 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ import { type DuplicateFinderOptions, type DuplicatePair, type FindDuplicatesResult } from "@mattgrill/nearline-core";
2
+ /**
3
+ * Find near-duplicate strings in a dataset (Node.js-optimized).
4
+ * Uses worker_threads for parallel computation when the input holds at least 5000 strings (the threshold is inclusive) and the resolved `workers` option is greater than 0; otherwise signatures are computed on the calling thread.
5
+ */
6
+ export declare function findDuplicates(strings: string[], opts?: DuplicateFinderOptions): Promise<FindDuplicatesResult>;
7
+ /**
8
+ * Class API for incremental use and querying against a built index (Node.js-optimized): collect strings with addStrings(), compute their signatures with buildIndex(), then look up near-duplicates of a single string with query().
9
+ */
10
+ export declare class DuplicateFinder {
11
+ private options;
12
+ private strings;
13
+ private signatures;
14
+ private hashA;
15
+ private hashB;
16
+ constructor(opts?: DuplicateFinderOptions);
17
+ addStrings(strings: string[]): void; // appends to the dataset and discards any previously built index
18
+ buildIndex(): Promise<void>; // computes signatures for every string added so far; worker threads are used at 5000+ strings when `workers` > 0
19
+ query(str: string): DuplicatePair[]; // throws Error if buildIndex() has not completed since the last addStrings() call
20
+ }
21
+ //# sourceMappingURL=finder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"finder.d.ts","sourceRoot":"","sources":["../src/finder.ts"],"names":[],"mappings":"AACA,OAAO,EACL,KAAK,sBAAsB,EAC3B,KAAK,aAAa,EAClB,KAAK,oBAAoB,EAO1B,MAAM,0BAA0B,CAAC;AAGlC;;;GAGG;AACH,wBAAsB,cAAc,CAClC,OAAO,EAAE,MAAM,EAAE,EACjB,IAAI,CAAC,EAAE,sBAAsB,GAC5B,OAAO,CAAC,oBAAoB,CAAC,CAK/B;AAED;;GAEG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,OAAO,CAAkB;IACjC,OAAO,CAAC,OAAO,CAAgB;IAC/B,OAAO,CAAC,UAAU,CAA4B;IAC9C,OAAO,CAAC,KAAK,CAAc;IAC3B,OAAO,CAAC,KAAK,CAAc;gBAEf,IAAI,CAAC,EAAE,sBAAsB;IAWzC,UAAU,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,IAAI;IAK7B,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAyBjC,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,aAAa,EAAE;CAapC"}
package/dist/index.cjs ADDED
@@ -0,0 +1,2 @@
1
+ (() => {
  "use strict";

  /*
   * CommonJS build of @mattgrill/nearline-node.
   *
   * Exposes `findDuplicates` and `DuplicateFinder`. Both delegate signature
   * computation and matching to @mattgrill/nearline-core; this file only adds
   * a worker_threads based way of computing signatures for large inputs.
   *
   * NOTE: this bundle was rewritten by hand; index.cjs.map no longer maps to it
   * and should be regenerated from the corrected sources.
   */
  const fs = require("node:fs");
  const os = require("node:os");
  const path = require("node:path");
  const workerThreads = require("node:worker_threads");
  const core = require("@mattgrill/nearline-core");

  /** Inputs with at least this many strings are eligible for the worker-thread path. */
  const PARALLEL_THRESHOLD = 5000;

  /** Default worker count: one per CPU, leaving one core for the main thread (minimum 1). */
  function defaultWorkerCount() {
    return Math.max(1, os.cpus().length - 1);
  }

  /**
   * Create the function that computes signatures on worker threads.
   *
   * The worker script is looked up next to this bundle. The previous build
   * resolved it from an `import.meta.url` value that the bundler had inlined as
   * the build machine's absolute `file:///Users/...` URL, so on every other
   * POSIX machine the path pointed at a non-existent directory and the
   * `__dirname` fallback was never reached.
   *
   * @returns the parallel compute function, or `null` when the worker script
   *          cannot be located (callers then compute on the calling thread).
   */
  function createNodeWorkerCompute() {
    let workerPath;
    try {
      workerPath = path.join(__dirname, "minhash-worker.cjs");
    } catch {
      return null;
    }
    if (!fs.existsSync(workerPath)) {
      return null;
    }

    /**
     * @param strings          input strings, hashed as-is (no preprocessing in workers)
     * @param ngramSize        n-gram length forwarded to the worker
     * @param numPermutations  number of 32-bit values per signature
     * @param hashA            first coefficient array (Uint32Array)
     * @param hashB            second coefficient array (Uint32Array)
     * @param numWorkers       maximum number of workers running at the same time
     * @param batchSize        number of strings handed to one worker; positive integer
     * @returns Uint32Array of `strings.length * numPermutations` signature values
     * @throws RangeError when `batchSize` is not a positive integer
     */
    return async (strings, ngramSize, numPermutations, hashA, hashB, numWorkers, batchSize) => {
      // A zero, negative or NaN step would make the batching loop below spin forever.
      if (!Number.isInteger(batchSize) || batchSize < 1) {
        throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
      }

      const total = strings.length;
      const signatures = new Uint32Array(total * numPermutations);

      const batches = [];
      for (let start = 0; start < total; start += batchSize) {
        batches.push({ start, end: Math.min(start + batchSize, total) });
      }

      // The coefficients are the same for every batch: convert them once, not per worker.
      const hashAList = Array.from(hashA);
      const hashBList = Array.from(hashB);

      const laneCount = Math.min(numWorkers, batches.length);
      let nextBatch = 0;

      // One "lane" runs batches one after another, each in a fresh worker,
      // until the shared batch queue is empty.
      const runLane = () =>
        new Promise((resolve, reject) => {
          const processNext = () => {
            if (nextBatch >= batches.length) {
              resolve();
              return;
            }

            const batch = batches[nextBatch++];
            const worker = new workerThreads.Worker(workerPath, {
              workerData: {
                strings: strings.slice(batch.start, batch.end),
                startIndex: batch.start,
                ngramSize,
                numPermutations,
                hashA: hashAList,
                hashB: hashBList,
              },
            });

            let delivered = false;

            worker.on("message", (result) => {
              delivered = true;
              try {
                signatures.set(
                  new Uint32Array(result.signatures),
                  result.startIndex * numPermutations
                );
              } catch (err) {
                // A malformed result must fail the computation, not crash the process.
                reject(err);
                worker.terminate().catch(() => {});
                return;
              }
              worker.terminate().then(processNext).catch(reject);
            });

            worker.on("error", reject);

            // Without this, a worker that dies before posting leaves the promise pending forever.
            worker.on("exit", (code) => {
              if (!delivered) {
                reject(
                  new Error(`minhash worker exited with code ${code} before returning signatures`)
                );
              }
            });
          };

          processNext();
        });

      const lanes = [];
      for (let i = 0; i < laneCount; i++) {
        lanes.push(runLane());
      }

      await Promise.all(lanes);
      return signatures;
    };
  }

  /**
   * Find near-duplicate strings in a dataset.
   *
   * Workers cannot receive the `preprocess` callback, so when one is configured
   * the worker path is disabled and the core pipeline preprocesses and hashes
   * on the calling thread.
   *
   * @param strings  dataset to scan
   * @param opts     optional settings, resolved through `resolveOptions`
   * @returns the result of the core `executePipeline`
   */
  async function findDuplicates(strings, opts) {
    const options = core.resolveOptions(opts, defaultWorkerCount());
    const parallelCompute = options.preprocess ? null : createNodeWorkerCompute();
    return core.executePipeline(strings, options, parallelCompute);
  }

  /**
   * Incremental API: addStrings() -> buildIndex() -> query().
   */
  class DuplicateFinder {
    options;
    strings = [];
    signatures = null;
    hashA;
    hashB;

    /** @param opts optional settings, resolved through `resolveOptions` */
    constructor(opts) {
      this.options = core.resolveOptions(opts, defaultWorkerCount());
      const { hashA, hashB } = core.generateHashCoefficients(
        this.options.numPermutations,
        this.options.seed
      );
      this.hashA = hashA;
      this.hashB = hashB;
    }

    /**
     * Append strings to the dataset and invalidate any built index.
     * Pushes one element at a time: `push(...strings)` throws a RangeError once
     * the array is larger than the engine's argument limit.
     */
    addStrings(strings) {
      for (const value of strings) {
        this.strings.push(value);
      }
      this.signatures = null;
    }

    /**
     * Compute signatures for every string added so far.
     * Worker threads are used only when a worker script is available, `workers`
     * is positive, the dataset has at least PARALLEL_THRESHOLD strings and no
     * `preprocess` callback is set (workers hash the raw strings, which would
     * disagree with query(), where the callback is applied).
     */
    async buildIndex() {
      const { ngramSize, numPermutations, workers, batchSize, preprocess } = this.options;
      const parallelCompute = preprocess ? null : createNodeWorkerCompute();

      if (parallelCompute && workers > 0 && this.strings.length >= PARALLEL_THRESHOLD) {
        this.signatures = await parallelCompute(
          this.strings,
          ngramSize,
          numPermutations,
          this.hashA,
          this.hashB,
          workers,
          batchSize
        );
      } else {
        this.signatures = core.computeAllSignatures(
          this.strings,
          ngramSize,
          numPermutations,
          this.hashA,
          this.hashB,
          preprocess
        );
      }
    }

    /**
     * Compare one string against the built index.
     * @throws Error when buildIndex() has not completed since the last addStrings()
     */
    query(str) {
      if (!this.signatures) {
        throw Error("Index not built. Call buildIndex() first.");
      }
      return core.querySignatures(
        str,
        this.options,
        this.signatures,
        this.hashA,
        this.hashB,
        this.strings.length
      );
    }
  }

  // Same export shape as the generated bundle: an ES-module-flagged namespace
  // object with enumerable getters.
  const exported = {};
  if (typeof Symbol !== "undefined" && Symbol.toStringTag) {
    Object.defineProperty(exported, Symbol.toStringTag, { value: "Module" });
  }
  Object.defineProperty(exported, "__esModule", { value: true });
  Object.defineProperty(exported, "DuplicateFinder", {
    enumerable: true,
    get: () => DuplicateFinder,
  });
  Object.defineProperty(exported, "findDuplicates", {
    enumerable: true,
    get: () => findDuplicates,
  });
  module.exports = exported;
})();
2
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.cjs","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/webpack/runtime/make_namespace_object","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> 
=> {\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function 
findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["e","Object","Symbol","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","defaultWorkers","cpus","options","resolveOptions","parallelCompute","executePipeline","DuplicateFinder","generateHashCoefficients","computeAllSignatures","str","Error","querySignatures"],"mappings":"2BAAA,GAAoB,CAAC,CAAG,CAACA,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAACA,EAAS,IACzEC,OAAO,cAAc,CAACD,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUC,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,GCClF,EAAoB,CAAC,CAAG,AAAC,IACrB,AAAkB,IAAlB,OAAOC,QAA0BA,OAAO,WAAW,EACrDD,OAAO,cAAc,CAAC,EAASC,OAAO,WAAW,CAAE,CAAE,MAAO,QAAS,GAEtED,OAAO,cAAc,CAAC,EAAS,aAAc,CAAE,MAAO,EAAK,EAC5D,E,iNCaO,SAASE,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAAAA,GAAAA,EAAAA,OAAAA,AAAAA,EAAQC,AAAAA,GAAAA,EAAAA,aAAAA,AAAAA,EAAc,gGACrC,CAAE,KAAM,CACNF,EAAaG,SACf,CACA,MAAOC,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,EAAKJ,EAAY,qBAC1B,GAUE,CAAE,KAAM,CACN,OAAO,IACT,CAEA,OAAO,MACLK,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,KAEA,IAAMC,EAAIP,EAAQ,MAAM,CAClBQ,EAAa,IAAIC,YAAYF,EAAIL,GAEjCQ,EAAiD,EAAE,CACzD,IAAK,IAAIC,EAAI,EAAGA,EAAIJ,EAAGI,GAAKL,EAC1BI,EAAQ,IAAI,CAAC,CAAE,MAAOC,EAAG,IAAKC,KAAK,GAAG,CAACD,EAAIL,EAAWC,EAAG,GAG3D,IAAMM,EAAgBD,KAAK,GAAG,CAACP,EAAYK,EAAQ,MAAM,EACrDI,EAAa,EAEXC,EAAe,IACZ,IAAIC,QAAQ,CAACC,EAASC,KAC3B,IAAMC,EAAc,KAClB,GAAIL,GAAcJ,EAAQ,MAAM,CAAE,YAChCO,IAIF,IA
AMG,EAAQV,CAAO,CAACI,IAAa,CAC7BO,EAAerB,EAAQ,KAAK,CAACoB,EAAM,KAAK,CAAEA,EAAM,GAAG,EAEnDE,EAAS,IAAIC,EAAAA,MAAMA,CAAC9B,EAAY,CACpC,WAAY,CACV,QAAS4B,EACT,WAAYD,EAAM,KAAK,CACvBnB,UAAAA,EACAC,gBAAAA,EACA,MAAOsB,MAAM,IAAI,CAACrB,GAClB,MAAOqB,MAAM,IAAI,CAACpB,EACpB,CACF,GAEAkB,EAAO,EAAE,CACP,UACA,AAACG,IAKC,IAAMC,EAAa,IAAIjB,YAAYgB,EAAO,UAAU,EAC9CE,EAASF,EAAO,UAAU,CAAGvB,EACnCM,EAAW,GAAG,CAACkB,EAAYC,GAC3BL,EAAO,SAAS,GAAG,IAAI,CAACH,GAAa,KAAK,CAACD,EAC7C,GAGFI,EAAO,EAAE,CAAC,QAASJ,EACrB,EAEAC,GACF,GAGIS,EAAkC,EAAE,CAC1C,IAAK,IAAIjB,EAAI,EAAGA,EAAIE,EAAeF,IACjCiB,EAAe,IAAI,CAACb,KAItB,OADA,MAAMC,QAAQ,GAAG,CAACY,GACXpB,CACT,CACF,CChFO,eAAeqB,EACpB7B,CAAiB,CACjB8B,CAA6B,EAE7B,IAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,GAC7CC,EAAUC,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GAC/BI,EAAkB3C,IACxB,MAAO4C,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EAAgBpC,EAASiC,EAASE,EAC3C,CAKO,MAAME,EACH,OAAyB,AACzB,SAAoB,EAAE,AAAC,AACvB,YAAiC,IAAK,AACtC,MAAmB,AACnB,MAAmB,AAE3B,aAAYP,CAA6B,CAAE,CACzC,MAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,EACnD,KAAI,CAAC,OAAO,CAAGE,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GACpC,KAAM,CAAE5B,MAAAA,CAAK,CAAEC,MAAAA,CAAK,CAAE,CAAGkC,AAAAA,GAAAA,EAAAA,wBAAAA,AAAAA,EACvB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAEnB,KAAI,CAAC,KAAK,CAAGnC,EACb,IAAI,CAAC,KAAK,CAAGC,CACf,CAEA,WAAWJ,CAAiB,CAAQ,CAClC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAIA,GACrB,IAAI,CAAC,UAAU,CAAG,IACpB,CAEA,MAAM,YAA4B,CAChC,IAAMmC,EAAkB3C,GAEpB2C,CAAAA,GAAmB,IAAI,CAAC,OAAO,CAAC,OAAO,CAAG,GAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAI,IACxE,IAAI,CAAC,UAAU,CAAG,MAAMA,EACtB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,OAAO,CACpB,IAAI,CAAC,OAAO,CAAC,SAAS,EAGxB,IAAI,CAAC,UAAU,CAAGI,AAAAA,GAAAA,EAAAA,oBAAAA,AAAAA,EAChB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAA
O,CAAC,UAAU,CAG7B,CAEA,MAAMC,CAAW,CAAmB,CAClC,GAAI,CAAC,IAAI,CAAC,UAAU,CAClB,MAAM,AAAIC,MAAM,6CAElB,MAAOC,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EACLF,EACA,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,UAAU,CACf,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,MAAM,CAEvB,CACF,C"}
@@ -0,0 +1,3 @@
1
+ export { findDuplicates, DuplicateFinder } from "./finder";
2
+ export type { DuplicateFinderOptions, DuplicatePair, DuplicateGroup, FindDuplicatesResult, FindDuplicatesStats, } from "@mattgrill/nearline-core";
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAG3D,YAAY,EACV,sBAAsB,EACtB,aAAa,EACb,cAAc,EACd,oBAAoB,EACpB,mBAAmB,GACpB,MAAM,0BAA0B,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,2 @@
1
+ import{fileURLToPath as t,fileURLToPath as s}from"node:url";import{dirname as r,dirname as e,join as i}from"node:path";import{cpus as n}from"node:os";import{computeAllSignatures as a,executePipeline as o,generateHashCoefficients as h,querySignatures as l,resolveOptions as p}from"@mattgrill/nearline-core";import{Worker as u}from"node:worker_threads";var m={};m.d=(t,s)=>{for(var r in s)m.o(s,r)&&!m.o(t,r)&&Object.defineProperty(t,r,{enumerable:!0,get:s[r]})},m.o=(t,s)=>Object.prototype.hasOwnProperty.call(t,s);var c={};m.d(c,{G:()=>w,I:()=>f});var g=r(t(import.meta.url));function d(){let t;try{t=function(){let t;try{t=e(s("file:///Users/matthewgrill/Developer/js/duplicate-finder/packages/server/src/workers/pool.ts"))}catch{t=g}return i(t,"minhash-worker.cjs")}()}catch{return null}return async(s,r,e,i,n,a,o)=>{let h=s.length,l=new Uint32Array(h*e),p=[];for(let t=0;t<h;t+=o)p.push({start:t,end:Math.min(t+o,h)});let m=Math.min(a,p.length),c=0,g=()=>new Promise((a,o)=>{let h=()=>{if(c>=p.length)return void a();let m=p[c++],g=s.slice(m.start,m.end),d=new u(t,{workerData:{strings:g,startIndex:m.start,ngramSize:r,numPermutations:e,hashA:Array.from(i),hashB:Array.from(n)}});d.on("message",t=>{let s=new Uint32Array(t.signatures),r=t.startIndex*e;l.set(s,r),d.terminate().then(h).catch(o)}),d.on("error",o)};h()}),d=[];for(let t=0;t<m;t++)d.push(g());return await Promise.all(d),l}}async function f(t,s){return o(t,p(s,Math.max(1,n().length-1)),d())}class w{options;strings=[];signatures=null;hashA;hashB;constructor(t){let s=Math.max(1,n().length-1);this.options=p(t,s);let{hashA:r,hashB:e}=h(this.options.numPermutations,this.options.seed);this.hashA=r,this.hashB=e}addStrings(t){this.strings.push(...t),this.signatures=null}async buildIndex(){let t=d();t&&this.options.workers>0&&this.strings.length>=5e3?this.signatures=await 
t(this.strings,this.options.ngramSize,this.options.numPermutations,this.hashA,this.hashB,this.options.workers,this.options.batchSize):this.signatures=a(this.strings,this.options.ngramSize,this.options.numPermutations,this.hashA,this.hashB,this.options.preprocess)}query(t){if(!this.signatures)throw Error("Index not built. Call buildIndex() first.");return l(t,this.options,this.signatures,this.hashA,this.hashB,this.strings.length)}}var y=c.G,_=c.I;export{y as DuplicateFinder,_ as findDuplicates};
2
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> => {\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const 
processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental 
use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["Object","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","executePipeline","resolveOptions","cpus","DuplicateFinder","defaultWorkers","generateHashCoefficients","parallelCompute","computeAllSignatures","str","Error","querySignatures"],"mappings":"uWAAA,GAAoB,CAAC,CAAG,CAAC,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAAC,EAAS,IACzEA,OAAO,cAAc,CAAC,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUA,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,G,8DCmB3E,SAASC,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,EAAQC,EAAc,gGACrC,CAAE,KAAM,CACNF,EAAaG,CACf,CACA,OAAOC,EAAKJ,EAAY,qBAC1B,GAUE,CAAE,KAAM,CACN,OAAO,IACT,CAEA,OAAO,MACLK,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,KAEA,IAAMC,EAAIP,EAAQ,MAAM,CAClBQ,EAAa,IAAIC,YAAYF,EAAIL,GAEjCQ,EAAiD,EAAE,CACzD,IAAK,IAAIC,EAAI,EAAGA,EAAIJ,EAAGI,GAAKL,EAC1BI,EAAQ,IAAI,CAAC,CAAE,MAAOC,EAAG,IAAKC,KAAK,GAAG,CAACD,EAAIL,EAAWC,EAAG,GAG3D,IAAMM,EAAgBD,KAAK,GAAG,CAACP,EAAYK,EAAQ,MAAM,EACrDI,EAAa,EAEXC,EAAe,IACZ,IAAIC,QAAQ,CAACC,EAASC,KAC3B,IAAMC,EAAc,KAClB,GAAIL,GAAcJ,EAAQ,MAAM,CAAE,YAChCO,IAIF,IAAMG,EAAQV,CAAO,CAACI,IAAa,CAC7BO,EAAerB,EAAQ,KAAK,CAACoB,EAAM,KAAK,CAAEA,EAAM,GAAG,EAEnDE,EAAS,IAAIC,EAAO9B,EAAY,CACpC,WAAY,CACV,QAAS4B,EACT,WAAYD,EAAM,KAAK,CACvBnB,UAAAA,EACAC,gBAAAA,EACA,MAAOsB,MAAM,IAAI,CAACrB,GAClB,MAAOqB,MAAM,IAAI,CAACpB,EACpB,CACF,GAEAkB,EAAO,EAAE,CACP,UACA,AAACG,IAKC,I
AAMC,EAAa,IAAIjB,YAAYgB,EAAO,UAAU,EAC9CE,EAASF,EAAO,UAAU,CAAGvB,EACnCM,EAAW,GAAG,CAACkB,EAAYC,GAC3BL,EAAO,SAAS,GAAG,IAAI,CAACH,GAAa,KAAK,CAACD,EAC7C,GAGFI,EAAO,EAAE,CAAC,QAASJ,EACrB,EAEAC,GACF,GAGIS,EAAkC,EAAE,CAC1C,IAAK,IAAIjB,EAAI,EAAGA,EAAIE,EAAeF,IACjCiB,EAAe,IAAI,CAACb,KAItB,OADA,MAAMC,QAAQ,GAAG,CAACY,GACXpB,CACT,CACF,CChFO,eAAeqB,EACpB7B,CAAiB,CACjB8B,CAA6B,EAK7B,OAAOC,EAAgB/B,EAFPgC,EAAeF,EADRlB,KAAK,GAAG,CAAC,EAAGqB,IAAO,MAAM,CAAG,IAE3BzC,IAE1B,CAKO,MAAM0C,EACH,OAAyB,AACzB,SAAoB,EAAE,AAAC,AACvB,YAAiC,IAAK,AACtC,MAAmB,AACnB,MAAmB,AAE3B,aAAYJ,CAA6B,CAAE,CACzC,IAAMK,EAAiBvB,KAAK,GAAG,CAAC,EAAGqB,IAAO,MAAM,CAAG,EACnD,KAAI,CAAC,OAAO,CAAGD,EAAeF,EAAMK,GACpC,GAAM,CAAEhC,MAAAA,CAAK,CAAEC,MAAAA,CAAK,CAAE,CAAGgC,EACvB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAEnB,KAAI,CAAC,KAAK,CAAGjC,EACb,IAAI,CAAC,KAAK,CAAGC,CACf,CAEA,WAAWJ,CAAiB,CAAQ,CAClC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAIA,GACrB,IAAI,CAAC,UAAU,CAAG,IACpB,CAEA,MAAM,YAA4B,CAChC,IAAMqC,EAAkB7C,GAEpB6C,CAAAA,GAAmB,IAAI,CAAC,OAAO,CAAC,OAAO,CAAG,GAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAI,IACxE,IAAI,CAAC,UAAU,CAAG,MAAMA,EACtB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,OAAO,CACpB,IAAI,CAAC,OAAO,CAAC,SAAS,EAGxB,IAAI,CAAC,UAAU,CAAGC,EAChB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,UAAU,CAG7B,CAEA,MAAMC,CAAW,CAAmB,CAClC,GAAI,CAAC,IAAI,CAAC,UAAU,CAClB,MAAM,AAAIC,MAAM,6CAElB,OAAOC,EACLF,EACA,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,UAAU,CACf,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,MAAM,CAEvB,CACF,C"}
@@ -0,0 +1,2 @@
1
+ /*
 * minhash-worker.cjs: script executed inside a worker thread.
 *
 * Flow (see the last statements of the bundle):
 *   1. Read `workerData`: { strings, startIndex, ngramSize, numPermutations, hashA, hashB }.
 *   2. Rebuild `hashA` / `hashB` as Uint32Array and call `l` (exported as `u5`) with a
 *      null preprocess callback, so the strings are hashed exactly as received.
 *   3. Post { signatures, startIndex, count } to the parent; `signatures` is converted to a
 *      plain array with Array.from before posting.
 *
 * The rest of the file is a bundled copy of the core library; only `l` is used here.
 * Guide to the minified helpers:
 *   n(str, start, len)  32-bit FNV-1a-style hash over the UTF-16 code units of a substring.
 *   i(words, start, len) FNV-1a-style hash over the four bytes of each 32-bit word in a range.
 *   o(x)                one xorshift step (shifts 13, 17, 5), result forced to unsigned 32-bit.
 *   a(count, seed)      fills two Uint32Arrays (hashA, hashB) from `o`; a seed of 0 is replaced by 1.
 *   s(h, a, b)          mixes an n-gram hash with one coefficient pair into an unsigned 32-bit value.
 *   u(...)              signature of a pre-hashed n-gram array: per slot, the minimum of `s`.
 *   l(...)              signatures for an array of strings: slides a window of `ngramSize` over each
 *                       string (the whole string is hashed once when it is shorter than the window)
 *                       and keeps the minimum of `s` per slot; an empty string keeps 0xffffffff.
 *   f(...)              fraction of equal slots between two signatures.
 *   h(...)              candidate pairs: groups rows whose band hash (`i`) collides, encodes each
 *                       pair as indexA * count + indexB in a Set.
 *   m(code, count)      decodes such a pair code back to [indexA, indexB].
 *   p                   union-find with path compression and union by rank; getGroups returns
 *                       sorted groups with more than one member.
 *   c, d                set of n-gram hashes of a string / copy of a Set into a Uint32Array.
 *   g                   option defaults (threshold .8, ngramSize 3, numPermutations 128,
 *                       numBands 32, bandSize 4, batchSize 1000, preprocess null, seed 42).
 *   w                   full pipeline (signatures, candidates, threshold filter, grouping, stats).
 *   y                   compares one string against existing signatures; results use indexA -1.
 *
 * NOTE(review): `parentPort` is used without a null check, so this assumes the script is only
 * ever started through `new Worker(...)`. Confirm before loading it any other way.
 */
(()=>{"use strict";let t=require("node:worker_threads");var e={};e.d=(t,r)=>{for(var n in r)e.o(r,n)&&!e.o(t,n)&&Object.defineProperty(t,n,{enumerable:!0,get:r[n]})},e.o=(t,e)=>Object.prototype.hasOwnProperty.call(t,e);var r={};function n(t,e=0,r=t.length){let i=0x811c9dc5,o=e+r;for(let r=e;r<o;r++)i^=t.charCodeAt(r),i=Math.imul(i,0x1000193);return i>>>0}function i(t,e,r){let n=0x811c9dc5,i=e+r;for(let r=e;r<i;r++){let e=t[r];n^=255&e,n=Math.imul(n=Math.imul(n=Math.imul(n=Math.imul(n,0x1000193)^e>>>8&255,0x1000193)^e>>>16&255,0x1000193)^e>>>24&255,0x1000193)}return n>>>0}function o(t){return t^=t<<13,t^=t>>>17,(t^=t<<5)>>>0}function a(t,e){let r=new Uint32Array(t),n=new Uint32Array(t),i=0===e?1:e;for(let e=0;e<t;e++)i=o(i),r[e]=i,i=o(i),n[e]=i;return{hashA:r,hashB:n}}function s(t,e,r){let n=t;return((n=Math.imul((n=Math.imul(n^e,0x85ebca6b))^n>>>13,0xc2b2ae35))^r^n>>>16)>>>0}function u(t,e,r,n,i,o){for(let t=0;t<n;t++)i[o+t]=0xffffffff;let a=t.length;if(0!==a)for(let u=0;u<a;u++){let a=t[u];for(let t=0;t<n;t++){let n=s(a,e[t],r[t]);n<i[o+t]&&(i[o+t]=n)}}}function l(t,e,r,i,o,a){let u=t.length,l=new Uint32Array(u*r);for(let f=0;f<u;f++)!function(t,e,r,i,o,a,u){for(let t=0;t<o;t++)a[u+t]=0xffffffff;let l=t.length;if(0===l)return;if(l<e){let e=n(t,0,l);for(let t=0;t<o;t++)a[u+t]=s(e,r[t],i[t]);return}let f=l-e+1;for(let l=0;l<f;l++){let f=n(t,l,e);for(let t=0;t<o;t++){let e=s(f,r[t],i[t]);e<a[u+t]&&(a[u+t]=e)}}}(a?a(t[f]):t[f],e,i,o,r,l,f*r);return l}function f(t,e,r,n){let i=e*n,o=r*n,a=0;for(let e=0;e<n;e++)t[i+e]===t[o+e]&&a++;return a/n}function h(t,e,r,n,o){let a=new Set;for(let s=0;s<n;s++){let n=s*o,u=new Map;for(let a=0;a<e;a++){let e=i(t,a*r+n,o),s=u.get(e);s||(s=[],u.set(e,s)),s.push(a)}for(let t of u.values())if(!(t.length<2))for(let r=0;r<t.length;r++)for(let n=r+1;n<t.length;n++)a.add(t[r]*e+t[n])}return a}function 
m(t,e){return[t/e|0,t%e]}e.d(r,{HP:()=>c,sB:()=>y,wQ:()=>i,eE:()=>h,$i:()=>g,vm:()=>m,eL:()=>w,u5:()=>l,Wh:()=>n,g3:()=>a,_M:()=>f,aW:()=>u,Bu:()=>d,nC:()=>p});class p{parent;rank;constructor(t){this.parent=new Uint32Array(t),this.rank=new Uint8Array(t);for(let e=0;e<t;e++)this.parent[e]=e}find(t){let e=t;for(;this.parent[e]!==e;)e=this.parent[e];for(;this.parent[t]!==e;){let r=this.parent[t];this.parent[t]=e,t=r}return e}union(t,e){let r=this.find(t),n=this.find(e);r!==n&&(this.rank[r]<this.rank[n]?this.parent[r]=n:this.rank[r]>this.rank[n]?this.parent[n]=r:(this.parent[n]=r,this.rank[r]++))}getGroups(t){let e=new Map;for(let r of t){let t=this.find(r),n=e.get(t);n||(n=[],e.set(t,n)),n.push(r)}let r=[];for(let t of e.values())t.length>1&&(t.sort((t,e)=>t-e),r.push(t));return r.sort((t,e)=>t[0]-e[0]),r}}function c(t,e){let r=new Set,i=t.length;if(i<e)return i>0&&r.add(n(t,0,i)),r;let o=i-e+1;for(let i=0;i<o;i++)r.add(n(t,i,e));return r}function d(t){let e=new Uint32Array(t.size),r=0;for(let n of t)e[r++]=n;return e}function g(t,e){let r;return r=t?.workers===void 0||t?.workers==="auto"?e:t.workers,{threshold:t?.threshold??.8,ngramSize:t?.ngramSize??3,numPermutations:t?.numPermutations??128,numBands:t?.numBands??32,bandSize:t?.bandSize??4,workers:r,batchSize:t?.batchSize??1e3,preprocess:t?.preprocess??null,seed:t?.seed??42}}async function w(t,e,r){let n,i=performance.now(),o=t.length;if(o<2)return{pairs:[],groups:[],stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:0,duplicateCount:0,groupCount:0}};let{hashA:s,hashB:u}=a(e.numPermutations,e.seed),c=h(n=r&&e.workers>0&&o>=5e3?await r(t,e.ngramSize,e.numPermutations,s,u,e.workers,e.batchSize):l(t,e.ngramSize,e.numPermutations,s,u,e.preprocess),o,e.numPermutations,e.numBands,e.bandSize),d=[],g=new p(o),w=new Set;for(let t of 
c){let[r,i]=m(t,o),a=f(n,r,i,e.numPermutations);a>=e.threshold&&(d.push({indexA:r,indexB:i,similarity:a}),g.union(r,i),w.add(r),w.add(i))}d.sort((t,e)=>e.similarity-t.similarity);let y=g.getGroups(w).map(t=>({indices:t}));return{pairs:d,groups:y,stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:c.size,duplicateCount:d.length,groupCount:y.length}}}function y(t,e,r,n,i,o){let a=t;e.preprocess&&(a=e.preprocess(a));let s=d(c(a,e.ngramSize)),l=new Uint32Array(e.numPermutations);u(s,n,i,e.numPermutations,l,0);let f=[];for(let t=0;t<o;t++){let n=t*e.numPermutations,i=0;for(let t=0;t<e.numPermutations;t++)l[t]===r[n+t]&&i++;let o=i/e.numPermutations;o>=e.threshold&&f.push({indexA:-1,indexB:t,similarity:o})}return f.sort((t,e)=>e.similarity-t.similarity),f}r.nC;var P=r.u5;r.aW,r.vm,r._M,r.eL,r.eE,r.Wh,r.wQ,r.g3,r.sB,r.$i,r.HP,r.Bu;let x=t.workerData,{strings:A,startIndex:k,ngramSize:M,numPermutations:S}=x,z=P(A,M,S,new Uint32Array(x.hashA),new Uint32Array(x.hashB),null);t.parentPort.postMessage({signatures:Array.from(z),startIndex:k,count:A.length}),module.exports={}})();
2
+ //# sourceMappingURL=minhash-worker.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"minhash-worker.cjs","sources":["webpack://@mattgrill/nearline-node/../core/dist/index.js","webpack://@mattgrill/nearline-node/./src/workers/minhash-worker.ts"],"sourcesContent":["var e={};e.d=(t,n)=>{for(var r in n)e.o(n,r)&&!e.o(t,r)&&Object.defineProperty(t,r,{enumerable:!0,get:n[r]})},e.o=(e,t)=>Object.prototype.hasOwnProperty.call(e,t);var t={};function n(e,t=0,r=e.length){let i=0x811c9dc5,a=t+r;for(let n=t;n<a;n++)i^=e.charCodeAt(n),i=Math.imul(i,0x1000193);return i>>>0}function r(e,t,n){let r=0x811c9dc5,i=t+n;for(let n=t;n<i;n++){let t=e[n];r^=255&t,r=Math.imul(r=Math.imul(r=Math.imul(r=Math.imul(r,0x1000193)^t>>>8&255,0x1000193)^t>>>16&255,0x1000193)^t>>>24&255,0x1000193)}return r>>>0}function i(e){return e^=e<<13,e^=e>>>17,(e^=e<<5)>>>0}function a(e,t){let n=new Uint32Array(e),r=new Uint32Array(e),a=0===t?1:t;for(let t=0;t<e;t++)a=i(a),n[t]=a,a=i(a),r[t]=a;return{hashA:n,hashB:r}}function o(e,t,n){let r=e;return((r=Math.imul((r=Math.imul(r^t,0x85ebca6b))^r>>>13,0xc2b2ae35))^n^r>>>16)>>>0}function s(e,t,n,r,i,a){for(let e=0;e<r;e++)i[a+e]=0xffffffff;let s=e.length;if(0!==s)for(let u=0;u<s;u++){let s=e[u];for(let e=0;e<r;e++){let r=o(s,t[e],n[e]);r<i[a+e]&&(i[a+e]=r)}}}function u(e,t,r,i,a,s){let u=e.length,l=new Uint32Array(u*r);for(let f=0;f<u;f++)!function(e,t,r,i,a,s,u){for(let e=0;e<a;e++)s[u+e]=0xffffffff;let l=e.length;if(0===l)return;if(l<t){let t=n(e,0,l);for(let e=0;e<a;e++)s[u+e]=o(t,r[e],i[e]);return}let f=l-t+1;for(let l=0;l<f;l++){let f=n(e,l,t);for(let e=0;e<a;e++){let t=o(f,r[e],i[e]);t<s[u+e]&&(s[u+e]=t)}}}(s?s(e[f]):e[f],t,i,a,r,l,f*r);return l}function l(e,t,n,r){let i=t*r,a=n*r,o=0;for(let t=0;t<r;t++)e[i+t]===e[a+t]&&o++;return o/r}function f(e,t,n,i,a){let o=new Set;for(let u=0;u<i;u++){let i=u*a,l=new Map;for(let o=0;o<t;o++){let t=r(e,o*n+i,a),s=l.get(t);s||(s=[],l.set(t,s)),s.push(o)}for(let e of l.values())if(!(e.length<2))for(let n=0;n<e.length;n++)for(let r=n+1;r<e.length;r++){var 
s;o.add((s=e[n],s*t+e[r]))}}return o}function p(e,t){return[e/t|0,e%t]}e.d(t,{HP:()=>c,sB:()=>g,wQ:()=>r,eE:()=>f,$i:()=>m,vm:()=>p,eL:()=>d,u5:()=>u,Wh:()=>n,g3:()=>a,_M:()=>l,aW:()=>s,Bu:()=>h,nC:()=>_});class _{parent;rank;constructor(e){this.parent=new Uint32Array(e),this.rank=new Uint8Array(e);for(let t=0;t<e;t++)this.parent[t]=t}find(e){let t=e;for(;this.parent[t]!==t;)t=this.parent[t];for(;this.parent[e]!==t;){let n=this.parent[e];this.parent[e]=t,e=n}return t}union(e,t){let n=this.find(e),r=this.find(t);n!==r&&(this.rank[n]<this.rank[r]?this.parent[n]=r:this.rank[n]>this.rank[r]?this.parent[r]=n:(this.parent[r]=n,this.rank[n]++))}getGroups(e){let t=new Map;for(let n of e){let e=this.find(n),r=t.get(e);r||(r=[],t.set(e,r)),r.push(n)}let n=[];for(let e of t.values())e.length>1&&(e.sort((e,t)=>e-t),n.push(e));return n.sort((e,t)=>e[0]-t[0]),n}}function c(e,t){let r=new Set,i=e.length;if(i<t)return i>0&&r.add(n(e,0,i)),r;let a=i-t+1;for(let i=0;i<a;i++)r.add(n(e,i,t));return r}function h(e){let t=new Uint32Array(e.size),n=0;for(let r of e)t[n++]=r;return t}function m(e,t){let n;return n=e?.workers===void 0||e?.workers===\"auto\"?t:e.workers,{threshold:e?.threshold??.8,ngramSize:e?.ngramSize??3,numPermutations:e?.numPermutations??128,numBands:e?.numBands??32,bandSize:e?.bandSize??4,workers:n,batchSize:e?.batchSize??1e3,preprocess:e?.preprocess??null,seed:e?.seed??42}}async function d(e,t,n){let r,i=performance.now(),o=e.length;if(o<2)return{pairs:[],groups:[],stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:0,duplicateCount:0,groupCount:0}};let{hashA:s,hashB:c}=a(t.numPermutations,t.seed),h=f(r=n&&t.workers>0&&o>=5e3?await n(e,t.ngramSize,t.numPermutations,s,c,t.workers,t.batchSize):u(e,t.ngramSize,t.numPermutations,s,c,t.preprocess),o,t.numPermutations,t.numBands,t.bandSize),m=[],d=new _(o),g=new Set;for(let e of 
h){let[n,i]=p(e,o),a=l(r,n,i,t.numPermutations);a>=t.threshold&&(m.push({indexA:n,indexB:i,similarity:a}),d.union(n,i),g.add(n),g.add(i))}m.sort((e,t)=>t.similarity-e.similarity);let w=d.getGroups(g).map(e=>({indices:e}));return{pairs:m,groups:w,stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:h.size,duplicateCount:m.length,groupCount:w.length}}}function g(e,t,n,r,i,a){let o=e;t.preprocess&&(o=t.preprocess(o));let u=h(c(o,t.ngramSize)),l=new Uint32Array(t.numPermutations);s(u,r,i,t.numPermutations,l,0);let f=[];for(let e=0;e<a;e++){let r=e*t.numPermutations,i=0;for(let e=0;e<t.numPermutations;e++)l[e]===n[r+e]&&i++;let a=i/t.numPermutations;a>=t.threshold&&f.push({indexA:-1,indexB:e,similarity:a})}return f.sort((e,t)=>t.similarity-e.similarity),f}var w=t.nC,x=t.u5,b=t.aW,k=t.vm,S=t._M,y=t.eL,P=t.eE,v=t.Wh,C=t.wQ,A=t.g3,z=t.sB,M=t.$i,U=t.HP,B=t.Bu;export{w as UnionFind,x as computeAllSignatures,b as computeSignature,k as decodePair,S as estimateSimilarity,y as executePipeline,P as findCandidates,v as fnv1aString,C as fnv1aUint32,A as generateHashCoefficients,z as querySignatures,M as resolveOptions,U as shingle,B as shingleToArray};\n//# sourceMappingURL=index.js.map","import { parentPort, workerData } from \"node:worker_threads\";\nimport { computeAllSignatures } from \"@mattgrill/nearline-core\";\n\ninterface WorkerInput {\n strings: string[];\n startIndex: number;\n ngramSize: number;\n numPermutations: number;\n hashA: number[];\n hashB: number[];\n}\n\nconst data = workerData as WorkerInput;\nconst { strings, startIndex, ngramSize, numPermutations } = data;\nconst hashA = new Uint32Array(data.hashA);\nconst hashB = new Uint32Array(data.hashB);\n\nconst signatures = computeAllSignatures(\n strings, ngramSize, numPermutations, hashA, hashB, null\n);\n\nparentPort!.postMessage({\n signatures: Array.from(signatures),\n startIndex,\n count: 
strings.length,\n});\n"],"names":["e","t","Object","n","r","i","Math","a","Uint32Array","u","l","f","Set","Map","s","c","Uint8Array","performance","o","data","workerData","strings","startIndex","ngramSize","numPermutations","signatures","computeAllSignatures","parentPort","Array"],"mappings":"wDAAA,IAAIA,EAAE,CAAC,CAAEA,CAAAA,EAAE,CAAC,CAAC,CAACC,EAAE,KAAK,IAAI,IAAI,KAAK,EAAED,EAAE,CAAC,CAAC,EAAE,IAAI,CAACA,EAAE,CAAC,CAACC,EAAE,IAAIC,OAAO,cAAc,CAACD,EAAE,EAAE,CAAC,WAAW,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,EAAED,EAAE,CAAC,CAAC,CAAC,EAAE,IAAIE,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,CAAC,EAAE,SAASC,EAAE,CAAC,CAAC,EAAE,CAAC,CAACC,EAAE,EAAE,MAAM,EAAE,IAAIC,EAAE,WAAW,EAAE,EAAED,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIC,GAAG,EAAE,UAAU,CAAC,GAAGA,EAAEC,KAAK,IAAI,CAACD,EAAE,WAAW,OAAOA,IAAI,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,WAAWA,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAEA,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,AAAC,IAAG,IAAI,EAAE,EAAEC,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAE,WAAW,IAAI,EAAE,IAAI,WAAW,IAAI,GAAG,IAAI,WAAW,IAAI,GAAG,IAAI,UAAU,CAAC,OAAO,IAAI,CAAC,CAAC,SAAS,EAAE,CAAC,EAAE,OAAO,GAAG,GAAG,GAAG,GAAG,IAAI,GAAG,AAAC,IAAG,GAAG,KAAK,CAAC,CAAC,SAASC,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAIC,YAAY,GAAG,EAAE,IAAIA,YAAY,GAAG,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,MAAM,AAAC,CAAC,GAAEF,KAAK,IAAI,CAAC,AAAC,GAAEA,KAAK,IAAI,CAAC,EAAE,EAAE,WAAU,EAAG,IAAI,GAAG,WAAU,EAAG,EAAE,IAAI,EAAC,IAAK,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAACD,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIA,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,IAAI,EAAE,EAAE,MAAM,CAAC,GAAG,IAAI,EAAE,IAAI,IAAII,EAAE,EAAEA,EAAE,EAAEA,IAAI,CAAC,IAAI,EAAE,CAAC,CAACA,EAAE,CAAC,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAA
C,EAAE,CAAC,CAAC,CAAC,EAAE,CAAE,GAAEJ,CAAC,CAAC,EAAE,EAAE,EAAGA,CAAAA,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAACD,CAAC,CAACC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAII,EAAE,EAAE,MAAM,CAACC,EAAE,IAAIF,YAAYC,EAAEL,GAAG,IAAI,IAAIO,EAAE,EAAEA,EAAEF,EAAEE,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAACP,CAAC,CAACC,CAAC,CAAC,CAAC,CAAC,CAAC,CAACI,CAAC,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAACA,EAAE,EAAE,CAAC,WAAW,IAAIC,EAAE,EAAE,MAAM,CAAC,GAAG,IAAIA,EAAE,OAAO,GAAGA,EAAE,EAAE,CAAC,IAAI,EAAEP,EAAE,EAAE,EAAEO,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAACD,EAAE,EAAE,CAAC,EAAE,EAAEL,CAAC,CAAC,EAAE,CAACC,CAAC,CAAC,EAAE,EAAE,MAAM,CAAC,IAAIM,EAAED,EAAE,EAAE,EAAE,IAAI,IAAIA,EAAE,EAAEA,EAAEC,EAAED,IAAI,CAAC,IAAIC,EAAER,EAAE,EAAEO,EAAE,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAEC,EAAEP,CAAC,CAAC,EAAE,CAACC,CAAC,CAAC,EAAE,CAAE,GAAE,CAAC,CAACI,EAAE,EAAE,EAAG,EAAC,CAACA,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAACE,EAAE,EAAE,CAAC,CAACA,EAAE,CAAC,EAAEN,EAAE,EAAED,EAAEM,EAAEC,EAAEP,GAAG,OAAOM,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAIL,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAACA,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,IAAI,OAAO,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAIO,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIC,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,GAAGC,EAAE,EAAE,GAAG,CAAC,EAAGA,CAAAA,GAAIA,CAAAA,EAAE,EAAE,CAAC,EAAE,GAAG,CAAC,EAAEA,EAAC,EAAGA,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,KAAK,EAAE,MAAM,GAAG,GAAG,CAAE,GAAE,MAAM,CAAC,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,IAAW,EAAE,GAAG,CAAE,AAAE,CAAC,CAAC,EAAE,CAAG,EAAE,CAAC,CAAC,EAAE,CAAG,CAAC,OAAO,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,CAACd,EAAE,CAAC,CAAC,EAAE,CAAC,GAAG,IAAIe,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EA
AE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAIZ,EAAE,GAAG,IAAII,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,EAAG,OAAM,EAAE,MAAO,KAAK,aAAY,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAIC,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,IAAIQ,WAAW,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,GAAG,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,GAAG,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,AAAC,KAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,CAAC,EAAG,KAAI,GAAI,KAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAG,KAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,EAAE,EAAC,CAAC,CAAE,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,IAAIH,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,EAAG,IAAI,GAAE,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,EAAC,EAAG,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,KAAK,EAAE,MAAM,GAAG,EAAE,MAAM,CAAC,GAAI,GAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,CAAC,EAAC,EAAG,OAAO,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAASE,EAAE,CAAC,CAAC,CAAC,EAAE,IAAIX,EAAE,IAAIQ,IAAIP,EAAE,EAAE,MAAM,CAAC,GAAGA,EAAE,EAAE,OAAOA,EAAE,GAAGD,EAAE,GAAG,CAACD,EAAE,EAAE,EAAEE,IAAID,EAAE,IAAI,EAAEC,EAAE,EAAE,EAAE,IAAI,IAAIA,EAAE,EAAEA,EAAE,EAAEA,IAAID,EAAE,GAAG,CAACD,EAAE,EAAEE,EAAE,IAAI,OAAOD,CAAC,CAAC,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAII,YAAY,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,IAAI,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,UAAU,KAAK,GAAG,GAAG,UAAU,OAAO,EAAE,EAAE,OAAO,CAAC,CAAC,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,WAAW,EAAE,gBAAgB,GAAG,iBAAiB,IAAI,SAAS,GAAG,UAAU,GAAG,SAAS
,GAAG,UAAU,EAAE,QAAQ,EAAE,UAAU,GAAG,WAAW,IAAI,WAAW,GAAG,YAAY,KAAK,KAAK,GAAG,MAAM,EAAE,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAEH,EAAEY,YAAY,GAAG,GAAGC,EAAE,EAAE,MAAM,CAAC,GAAGA,EAAE,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,CAAC,YAAYD,YAAY,GAAG,GAAGZ,EAAE,WAAWa,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,CAAC,CAAC,EAAE,GAAG,CAAC,MAAMJ,CAAC,CAAC,MAAM,CAAC,CAAC,CAACP,EAAE,EAAE,eAAe,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAGW,GAAG,IAAI,MAAM,EAAE,EAAE,EAAE,SAAS,CAAC,EAAE,eAAe,CAACJ,EAAE,EAAE,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,EAAE,SAAS,CAAC,EAAE,eAAe,CAACA,EAAE,EAAE,EAAE,UAAU,EAAEI,EAAE,EAAE,eAAe,CAAC,EAAE,QAAQ,CAAC,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,EAAEA,GAAG,EAAE,IAAIN,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC,GAAG,CAAC,EAAEP,EAAE,CAAC,EAAE,EAAEa,GAAGX,EAAE,EAAE,EAAE,EAAEF,EAAE,EAAE,eAAe,CAAEE,CAAAA,GAAG,EAAE,SAAS,EAAG,GAAE,IAAI,CAAC,CAAC,OAAO,EAAE,OAAOF,EAAE,WAAWE,CAAC,GAAG,EAAE,KAAK,CAAC,EAAEF,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,GAAG,CAACA,EAAC,CAAE,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,EAAE,SAAS,CAAC,GAAG,GAAG,CAAC,GAAI,EAAC,QAAQ,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,YAAYY,YAAY,GAAG,GAAGZ,EAAE,WAAWa,EAAE,eAAe,EAAE,IAAI,CAAC,eAAe,EAAE,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAACb,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAE,GAAE,UAAU,EAAG,GAAE,EAAE,UAAU,CAAC,EAAC,EAAG,IAAI,EAAE,EAAEU,EAAE,EAAE,EAAE,SAAS,GAAGL,EAAE,IAAIF,YAAY,EAAE,eAAe,EAAE,EAAE,EAAE,EAAEH,EAAE,EAAE,eAAe,CAACK,EAAE,GAAG,IAAIC,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,eAAe,CAACN,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,eAAe,CAAC,IAAIK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAEL,IAAI,IAAI,EAAEA,EAAE,EAAE,eAAe,AAAC,IAAG,EAAE,SAAS,EAAEM,EAAE,IAAI,CAAC,CAAC,OAAO,GAAG,OAAO,EAAE,WAAW,CAAC,EAAE,CAAC,OAAOA,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,EAAE,UAAU,EAAEA,CAAC,CAAO,EAAE,EAAE,CAAV,IAAW,EAAE,EAAE,EAAE,AAAG,GAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CA
AG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CCY95I,IAAMQ,EAAOC,EAAAA,UAAUA,CACjB,CAAEC,QAAAA,CAAO,CAAEC,WAAAA,CAAU,CAAEC,UAAAA,CAAS,CAAEC,gBAAAA,CAAe,CAAE,CAAGL,EAItDM,EAAaC,EACjBL,EAASE,EAAWC,EAJR,IAAIhB,YAAYW,EAAK,KAAK,EAC1B,IAAIX,YAAYW,EAAK,KAAK,EAGa,MAGrDQ,EAAAA,UAAAA,CAAAA,WAAuB,CAAC,CACtB,WAAYC,MAAM,IAAI,CAACH,GACvBH,WAAAA,EACA,MAAOD,EAAQ,MAAM,AACvB,G"}
@@ -0,0 +1,2 @@
1
+ import{parentPort as t,workerData as e}from"node:worker_threads";var r={};r.d=(t,e)=>{for(var n in e)r.o(e,n)&&!r.o(t,n)&&Object.defineProperty(t,n,{enumerable:!0,get:e[n]})},r.o=(t,e)=>Object.prototype.hasOwnProperty.call(t,e);var n={};function i(t,e=0,r=t.length){let n=0x811c9dc5,o=e+r;for(let r=e;r<o;r++)n^=t.charCodeAt(r),n=Math.imul(n,0x1000193);return n>>>0}function o(t,e,r){let n=0x811c9dc5,i=e+r;for(let r=e;r<i;r++){let e=t[r];n^=255&e,n=Math.imul(n=Math.imul(n=Math.imul(n=Math.imul(n,0x1000193)^e>>>8&255,0x1000193)^e>>>16&255,0x1000193)^e>>>24&255,0x1000193)}return n>>>0}function a(t){return t^=t<<13,t^=t>>>17,(t^=t<<5)>>>0}function s(t,e){let r=new Uint32Array(t),n=new Uint32Array(t),i=0===e?1:e;for(let e=0;e<t;e++)i=a(i),r[e]=i,i=a(i),n[e]=i;return{hashA:r,hashB:n}}function u(t,e,r){let n=t;return((n=Math.imul((n=Math.imul(n^e,0x85ebca6b))^n>>>13,0xc2b2ae35))^r^n>>>16)>>>0}function l(t,e,r,n,i,o){for(let t=0;t<n;t++)i[o+t]=0xffffffff;let a=t.length;if(0!==a)for(let s=0;s<a;s++){let a=t[s];for(let t=0;t<n;t++){let n=u(a,e[t],r[t]);n<i[o+t]&&(i[o+t]=n)}}}function f(t,e,r,n,o,a){let s=t.length,l=new Uint32Array(s*r);for(let f=0;f<s;f++)!function(t,e,r,n,o,a,s){for(let t=0;t<o;t++)a[s+t]=0xffffffff;let l=t.length;if(0===l)return;if(l<e){let e=i(t,0,l);for(let t=0;t<o;t++)a[s+t]=u(e,r[t],n[t]);return}let f=l-e+1;for(let l=0;l<f;l++){let f=i(t,l,e);for(let t=0;t<o;t++){let e=u(f,r[t],n[t]);e<a[s+t]&&(a[s+t]=e)}}}(a?a(t[f]):t[f],e,n,o,r,l,f*r);return l}function h(t,e,r,n){let i=e*n,o=r*n,a=0;for(let e=0;e<n;e++)t[i+e]===t[o+e]&&a++;return a/n}function m(t,e,r,n,i){let a=new Set;for(let s=0;s<n;s++){let n=s*i,u=new Map;for(let a=0;a<e;a++){let e=o(t,a*r+n,i),s=u.get(e);s||(s=[],u.set(e,s)),s.push(a)}for(let t of u.values())if(!(t.length<2))for(let r=0;r<t.length;r++)for(let n=r+1;n<t.length;n++)a.add(t[r]*e+t[n])}return a}function 
p(t,e){return[t/e|0,t%e]}r.d(n,{HP:()=>d,sB:()=>A,wQ:()=>o,eE:()=>m,$i:()=>w,vm:()=>p,eL:()=>y,u5:()=>f,Wh:()=>i,g3:()=>s,_M:()=>h,aW:()=>l,Bu:()=>g,nC:()=>c});class c{parent;rank;constructor(t){this.parent=new Uint32Array(t),this.rank=new Uint8Array(t);for(let e=0;e<t;e++)this.parent[e]=e}find(t){let e=t;for(;this.parent[e]!==e;)e=this.parent[e];for(;this.parent[t]!==e;){let r=this.parent[t];this.parent[t]=e,t=r}return e}union(t,e){let r=this.find(t),n=this.find(e);r!==n&&(this.rank[r]<this.rank[n]?this.parent[r]=n:this.rank[r]>this.rank[n]?this.parent[n]=r:(this.parent[n]=r,this.rank[r]++))}getGroups(t){let e=new Map;for(let r of t){let t=this.find(r),n=e.get(t);n||(n=[],e.set(t,n)),n.push(r)}let r=[];for(let t of e.values())t.length>1&&(t.sort((t,e)=>t-e),r.push(t));return r.sort((t,e)=>t[0]-e[0]),r}}function d(t,e){let r=new Set,n=t.length;if(n<e)return n>0&&r.add(i(t,0,n)),r;let o=n-e+1;for(let n=0;n<o;n++)r.add(i(t,n,e));return r}function g(t){let e=new Uint32Array(t.size),r=0;for(let n of t)e[r++]=n;return e}function w(t,e){let r;return r=t?.workers===void 0||t?.workers==="auto"?e:t.workers,{threshold:t?.threshold??.8,ngramSize:t?.ngramSize??3,numPermutations:t?.numPermutations??128,numBands:t?.numBands??32,bandSize:t?.bandSize??4,workers:r,batchSize:t?.batchSize??1e3,preprocess:t?.preprocess??null,seed:t?.seed??42}}async function y(t,e,r){let n,i=performance.now(),o=t.length;if(o<2)return{pairs:[],groups:[],stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:0,duplicateCount:0,groupCount:0}};let{hashA:a,hashB:u}=s(e.numPermutations,e.seed),l=m(n=r&&e.workers>0&&o>=5e3?await r(t,e.ngramSize,e.numPermutations,a,u,e.workers,e.batchSize):f(t,e.ngramSize,e.numPermutations,a,u,e.preprocess),o,e.numPermutations,e.numBands,e.bandSize),d=[],g=new c(o),w=new Set;for(let t of 
l){let[r,i]=p(t,o),a=h(n,r,i,e.numPermutations);a>=e.threshold&&(d.push({indexA:r,indexB:i,similarity:a}),g.union(r,i),w.add(r),w.add(i))}d.sort((t,e)=>e.similarity-t.similarity);let y=g.getGroups(w).map(t=>({indices:t}));return{pairs:d,groups:y,stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:l.size,duplicateCount:d.length,groupCount:y.length}}}function A(t,e,r,n,i,o){let a=t;e.preprocess&&(a=e.preprocess(a));let s=g(d(a,e.ngramSize)),u=new Uint32Array(e.numPermutations);l(s,n,i,e.numPermutations,u,0);let f=[];for(let t=0;t<o;t++){let n=t*e.numPermutations,i=0;for(let t=0;t<e.numPermutations;t++)u[t]===r[n+t]&&i++;let o=i/e.numPermutations;o>=e.threshold&&f.push({indexA:-1,indexB:t,similarity:o})}return f.sort((t,e)=>e.similarity-t.similarity),f}n.nC;var P=n.u5;n.aW,n.vm,n._M,n.eL,n.eE,n.Wh,n.wQ,n.g3,n.sB,n.$i,n.HP,n.Bu;let{strings:x,startIndex:k,ngramSize:M,numPermutations:S}=e,z=P(x,M,S,new Uint32Array(e.hashA),new Uint32Array(e.hashB),null);t.postMessage({signatures:Array.from(z),startIndex:k,count:x.length});
2
+ //# sourceMappingURL=minhash-worker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"minhash-worker.js","sources":["webpack://@mattgrill/nearline-node/../core/dist/index.js","webpack://@mattgrill/nearline-node/./src/workers/minhash-worker.ts"],"sourcesContent":["var e={};e.d=(t,n)=>{for(var r in n)e.o(n,r)&&!e.o(t,r)&&Object.defineProperty(t,r,{enumerable:!0,get:n[r]})},e.o=(e,t)=>Object.prototype.hasOwnProperty.call(e,t);var t={};function n(e,t=0,r=e.length){let i=0x811c9dc5,a=t+r;for(let n=t;n<a;n++)i^=e.charCodeAt(n),i=Math.imul(i,0x1000193);return i>>>0}function r(e,t,n){let r=0x811c9dc5,i=t+n;for(let n=t;n<i;n++){let t=e[n];r^=255&t,r=Math.imul(r=Math.imul(r=Math.imul(r=Math.imul(r,0x1000193)^t>>>8&255,0x1000193)^t>>>16&255,0x1000193)^t>>>24&255,0x1000193)}return r>>>0}function i(e){return e^=e<<13,e^=e>>>17,(e^=e<<5)>>>0}function a(e,t){let n=new Uint32Array(e),r=new Uint32Array(e),a=0===t?1:t;for(let t=0;t<e;t++)a=i(a),n[t]=a,a=i(a),r[t]=a;return{hashA:n,hashB:r}}function o(e,t,n){let r=e;return((r=Math.imul((r=Math.imul(r^t,0x85ebca6b))^r>>>13,0xc2b2ae35))^n^r>>>16)>>>0}function s(e,t,n,r,i,a){for(let e=0;e<r;e++)i[a+e]=0xffffffff;let s=e.length;if(0!==s)for(let u=0;u<s;u++){let s=e[u];for(let e=0;e<r;e++){let r=o(s,t[e],n[e]);r<i[a+e]&&(i[a+e]=r)}}}function u(e,t,r,i,a,s){let u=e.length,l=new Uint32Array(u*r);for(let f=0;f<u;f++)!function(e,t,r,i,a,s,u){for(let e=0;e<a;e++)s[u+e]=0xffffffff;let l=e.length;if(0===l)return;if(l<t){let t=n(e,0,l);for(let e=0;e<a;e++)s[u+e]=o(t,r[e],i[e]);return}let f=l-t+1;for(let l=0;l<f;l++){let f=n(e,l,t);for(let e=0;e<a;e++){let t=o(f,r[e],i[e]);t<s[u+e]&&(s[u+e]=t)}}}(s?s(e[f]):e[f],t,i,a,r,l,f*r);return l}function l(e,t,n,r){let i=t*r,a=n*r,o=0;for(let t=0;t<r;t++)e[i+t]===e[a+t]&&o++;return o/r}function f(e,t,n,i,a){let o=new Set;for(let u=0;u<i;u++){let i=u*a,l=new Map;for(let o=0;o<t;o++){let t=r(e,o*n+i,a),s=l.get(t);s||(s=[],l.set(t,s)),s.push(o)}for(let e of l.values())if(!(e.length<2))for(let n=0;n<e.length;n++)for(let r=n+1;r<e.length;r++){var 
s;o.add((s=e[n],s*t+e[r]))}}return o}function p(e,t){return[e/t|0,e%t]}e.d(t,{HP:()=>c,sB:()=>g,wQ:()=>r,eE:()=>f,$i:()=>m,vm:()=>p,eL:()=>d,u5:()=>u,Wh:()=>n,g3:()=>a,_M:()=>l,aW:()=>s,Bu:()=>h,nC:()=>_});class _{parent;rank;constructor(e){this.parent=new Uint32Array(e),this.rank=new Uint8Array(e);for(let t=0;t<e;t++)this.parent[t]=t}find(e){let t=e;for(;this.parent[t]!==t;)t=this.parent[t];for(;this.parent[e]!==t;){let n=this.parent[e];this.parent[e]=t,e=n}return t}union(e,t){let n=this.find(e),r=this.find(t);n!==r&&(this.rank[n]<this.rank[r]?this.parent[n]=r:this.rank[n]>this.rank[r]?this.parent[r]=n:(this.parent[r]=n,this.rank[n]++))}getGroups(e){let t=new Map;for(let n of e){let e=this.find(n),r=t.get(e);r||(r=[],t.set(e,r)),r.push(n)}let n=[];for(let e of t.values())e.length>1&&(e.sort((e,t)=>e-t),n.push(e));return n.sort((e,t)=>e[0]-t[0]),n}}function c(e,t){let r=new Set,i=e.length;if(i<t)return i>0&&r.add(n(e,0,i)),r;let a=i-t+1;for(let i=0;i<a;i++)r.add(n(e,i,t));return r}function h(e){let t=new Uint32Array(e.size),n=0;for(let r of e)t[n++]=r;return t}function m(e,t){let n;return n=e?.workers===void 0||e?.workers===\"auto\"?t:e.workers,{threshold:e?.threshold??.8,ngramSize:e?.ngramSize??3,numPermutations:e?.numPermutations??128,numBands:e?.numBands??32,bandSize:e?.bandSize??4,workers:n,batchSize:e?.batchSize??1e3,preprocess:e?.preprocess??null,seed:e?.seed??42}}async function d(e,t,n){let r,i=performance.now(),o=e.length;if(o<2)return{pairs:[],groups:[],stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:0,duplicateCount:0,groupCount:0}};let{hashA:s,hashB:c}=a(t.numPermutations,t.seed),h=f(r=n&&t.workers>0&&o>=5e3?await n(e,t.ngramSize,t.numPermutations,s,c,t.workers,t.batchSize):u(e,t.ngramSize,t.numPermutations,s,c,t.preprocess),o,t.numPermutations,t.numBands,t.bandSize),m=[],d=new _(o),g=new Set;for(let e of 
h){let[n,i]=p(e,o),a=l(r,n,i,t.numPermutations);a>=t.threshold&&(m.push({indexA:n,indexB:i,similarity:a}),d.union(n,i),g.add(n),g.add(i))}m.sort((e,t)=>t.similarity-e.similarity);let w=d.getGroups(g).map(e=>({indices:e}));return{pairs:m,groups:w,stats:{totalTimeMs:performance.now()-i,inputCount:o,candidateCount:h.size,duplicateCount:m.length,groupCount:w.length}}}function g(e,t,n,r,i,a){let o=e;t.preprocess&&(o=t.preprocess(o));let u=h(c(o,t.ngramSize)),l=new Uint32Array(t.numPermutations);s(u,r,i,t.numPermutations,l,0);let f=[];for(let e=0;e<a;e++){let r=e*t.numPermutations,i=0;for(let e=0;e<t.numPermutations;e++)l[e]===n[r+e]&&i++;let a=i/t.numPermutations;a>=t.threshold&&f.push({indexA:-1,indexB:e,similarity:a})}return f.sort((e,t)=>t.similarity-e.similarity),f}var w=t.nC,x=t.u5,b=t.aW,k=t.vm,S=t._M,y=t.eL,P=t.eE,v=t.Wh,C=t.wQ,A=t.g3,z=t.sB,M=t.$i,U=t.HP,B=t.Bu;export{w as UnionFind,x as computeAllSignatures,b as computeSignature,k as decodePair,S as estimateSimilarity,y as executePipeline,P as findCandidates,v as fnv1aString,C as fnv1aUint32,A as generateHashCoefficients,z as querySignatures,M as resolveOptions,U as shingle,B as shingleToArray};\n//# sourceMappingURL=index.js.map","import { parentPort, workerData } from \"node:worker_threads\";\nimport { computeAllSignatures } from \"@mattgrill/nearline-core\";\n\ninterface WorkerInput {\n strings: string[];\n startIndex: number;\n ngramSize: number;\n numPermutations: number;\n hashA: number[];\n hashB: number[];\n}\n\nconst data = workerData as WorkerInput;\nconst { strings, startIndex, ngramSize, numPermutations } = data;\nconst hashA = new Uint32Array(data.hashA);\nconst hashB = new Uint32Array(data.hashB);\n\nconst signatures = computeAllSignatures(\n strings, ngramSize, numPermutations, hashA, hashB, null\n);\n\nparentPort!.postMessage({\n signatures: Array.from(signatures),\n startIndex,\n count: 
strings.length,\n});\n"],"names":["t","Object","r","Math","i","Uint32Array","l","f","Set","Map","s","p","Uint8Array","performance","o","a","strings","startIndex","ngramSize","numPermutations","workerData","signatures","computeAllSignatures","data","parentPort","Array"],"mappings":"iEAAA,IAAI,EAAE,CAAC,CAAE,GAAE,CAAC,CAAC,CAACA,EAAE,KAAK,IAAI,IAAI,KAAK,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,CAAC,CAACA,EAAE,IAAIC,OAAO,cAAc,CAACD,EAAE,EAAE,CAAC,WAAW,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,IAAIC,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,GAAG,IAAI,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,EAAE,CAAC,CAACC,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,WAAW,EAAE,EAAEA,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,GAAG,EAAE,UAAU,CAAC,GAAG,EAAEC,KAAK,IAAI,CAAC,EAAE,WAAW,OAAO,IAAI,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,WAAWC,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAEA,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,AAAC,IAAG,IAAI,EAAE,EAAED,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAEA,KAAK,IAAI,CAAC,EAAE,WAAW,IAAI,EAAE,IAAI,WAAW,IAAI,GAAG,IAAI,WAAW,IAAI,GAAG,IAAI,UAAU,CAAC,OAAO,IAAI,CAAC,CAAC,SAAS,EAAE,CAAC,EAAE,OAAO,GAAG,GAAG,GAAG,GAAG,IAAI,GAAG,AAAC,IAAG,GAAG,KAAK,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAIE,YAAY,GAAG,EAAE,IAAIA,YAAY,GAAG,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,MAAM,AAAC,CAAC,GAAEF,KAAK,IAAI,CAAC,AAAC,GAAEA,KAAK,IAAI,CAAC,EAAE,EAAE,WAAU,EAAG,IAAI,GAAG,WAAU,EAAG,EAAE,IAAI,EAAC,IAAK,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAACC,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIA,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,IAAI,EAAE,EAAE,MAAM,CAAC,GAAG,IAAI,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAE,GA
AEA,CAAC,CAAC,EAAE,EAAE,EAAGA,CAAAA,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAACF,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE,MAAM,CAACI,EAAE,IAAID,YAAY,EAAEH,GAAG,IAAI,IAAIK,EAAE,EAAEA,EAAE,EAAEA,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAACL,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,IAAII,EAAE,EAAE,MAAM,CAAC,GAAG,IAAIA,EAAE,OAAO,GAAGA,EAAE,EAAE,CAAC,IAAI,EAAE,EAAE,EAAE,EAAEA,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,EAAEJ,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,MAAM,CAAC,IAAIK,EAAED,EAAE,EAAE,EAAE,IAAI,IAAIA,EAAE,EAAEA,EAAEC,EAAED,IAAI,CAAC,IAAIC,EAAE,EAAE,EAAED,EAAE,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAEC,EAAEL,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAE,GAAE,CAAC,CAAC,EAAE,EAAE,EAAG,EAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAACK,EAAE,EAAE,CAAC,CAACA,EAAE,CAAC,EAAE,EAAE,EAAEL,EAAEI,EAAEC,EAAEL,GAAG,OAAOI,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAIF,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,CAACA,EAAE,EAAE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,IAAI,OAAO,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAII,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIC,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,GAAGC,EAAE,EAAE,GAAG,CAAC,EAAGA,CAAAA,GAAIA,CAAAA,EAAE,EAAE,CAAC,EAAE,GAAG,CAAC,EAAEA,EAAC,EAAGA,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,IAAI,KAAK,EAAE,MAAM,GAAG,GAAG,CAAE,GAAE,MAAM,CAAC,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,IAAI,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,CAAC,IAAW,EAAE,GAAG,CAAE,AAAE,CAAC,CAAC,EAAE,CAAG,EAAE,CAAC,CAAC,EAAE,CAAG,CAAC,OAAO,CAAC,CAAC,SAASC,EAAE,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,EAAE,EAAE,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAIA,EAAE,GAAG,IAAI,EAAE,G
AAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,EAAE,GAAG,IAAI,CAAC,EAAG,OAAM,EAAE,MAAO,KAAK,aAAY,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,IAAIN,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,IAAIO,WAAW,GAAG,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,GAAG,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,KAAK,IAAI,CAAC,MAAM,CAAC,EAAE,GAAG,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,EAAE,AAAC,KAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,IAAI,CAAC,EAAG,KAAI,GAAI,KAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAG,KAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,EAAE,EAAC,CAAC,CAAE,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,IAAIH,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,EAAG,IAAI,GAAE,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,EAAC,EAAG,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,IAAI,KAAK,EAAE,MAAM,GAAG,EAAE,MAAM,CAAC,GAAI,GAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,CAAC,EAAC,EAAG,OAAO,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,IAAIP,EAAE,IAAIM,IAAI,EAAE,EAAE,MAAM,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,GAAGN,EAAE,GAAG,CAAC,EAAE,EAAE,EAAE,IAAIA,EAAE,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAIA,EAAE,GAAG,CAAC,EAAE,EAAE,EAAE,IAAI,OAAOA,CAAC,CAAC,SAAS,EAAE,CAAC,EAAE,IAAI,EAAE,IAAIG,YAAY,EAAE,IAAI,EAAE,EAAE,EAAE,IAAI,IAAI,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,GAAG,UAAU,KAAK,GAAG,GAAG,UAAU,OAAO,EAAE,EAAE,OAAO,CAAC,CAAC,UAAU,GAAG,WAAW,GAAG,UAAU,GAAG,WAAW,EAAE,gBAAgB,GAAG,iBAAiB,IAAI,SAAS,GAAG,UAAU,GAAG,SAAS,GAAG,UAAU,EAAE,QAAQ,EAAE,UAAU,GAAG,WAAW,IAAI,WAAW,GAAG,YAAY,KA
AK,KAAK,GAAG,MAAM,EAAE,CAAC,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAED,EAAES,YAAY,GAAG,GAAGC,EAAE,EAAE,MAAM,CAAC,GAAGA,EAAE,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,CAAC,YAAYD,YAAY,GAAG,GAAGT,EAAE,WAAWU,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,CAAC,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,eAAe,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAGA,GAAG,IAAI,MAAM,EAAE,EAAE,EAAE,SAAS,CAAC,EAAE,eAAe,CAAC,EAAE,EAAE,EAAE,OAAO,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,EAAE,SAAS,CAAC,EAAE,eAAe,CAAC,EAAE,EAAE,EAAE,UAAU,EAAEA,EAAE,EAAE,eAAe,CAAC,EAAE,QAAQ,CAAC,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,EAAEA,GAAG,EAAE,IAAIN,IAAI,IAAI,IAAI,KAAK,EAAE,CAAC,GAAG,CAAC,EAAEJ,EAAE,CAACO,EAAE,EAAEG,GAAGC,EAAE,EAAE,EAAE,EAAEX,EAAE,EAAE,eAAe,CAAEW,CAAAA,GAAG,EAAE,SAAS,EAAG,GAAE,IAAI,CAAC,CAAC,OAAO,EAAE,OAAOX,EAAE,WAAWW,CAAC,GAAG,EAAE,KAAK,CAAC,EAAEX,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,GAAG,CAACA,EAAC,CAAE,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE,EAAE,SAAS,CAAC,GAAG,GAAG,CAAC,GAAI,EAAC,QAAQ,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,MAAM,CAAC,YAAYS,YAAY,GAAG,GAAGT,EAAE,WAAWU,EAAE,eAAe,EAAE,IAAI,CAAC,eAAe,EAAE,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAACV,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAE,GAAE,UAAU,EAAG,GAAE,EAAE,UAAU,CAAC,EAAC,EAAG,IAAI,EAAE,EAAE,EAAE,EAAE,EAAE,SAAS,GAAG,EAAE,IAAIC,YAAY,EAAE,eAAe,EAAE,EAAE,EAAE,EAAED,EAAE,EAAE,eAAe,CAAC,EAAE,GAAG,IAAIG,EAAE,EAAE,CAAC,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,IAAI,CAAC,IAAI,EAAE,EAAE,EAAE,eAAe,CAACH,EAAE,EAAE,IAAI,IAAI,EAAE,EAAE,EAAE,EAAE,eAAe,CAAC,IAAI,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAEA,IAAI,IAAI,EAAEA,EAAE,EAAE,eAAe,AAAC,IAAG,EAAE,SAAS,EAAEG,EAAE,IAAI,CAAC,CAAC,OAAO,GAAG,OAAO,EAAE,WAAW,CAAC,EAAE,CAAC,OAAOA,EAAE,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,EAAE,UAAU,EAAEA,CAAC,CAAO,EAAE,EAAE,CAAV,IAAW,EAAE,EAAE,EAAE,AAAG,GAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EA
AE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CAAG,EAAE,EAAE,CCa95I,GAAM,CAAES,QAAAA,CAAO,CAAEC,WAAAA,CAAU,CAAEC,UAAAA,CAAS,CAAEC,gBAAAA,CAAe,CAAE,CAD5CC,EAKPC,EAAaC,EACjBN,EAASE,EAAWC,EAJR,IAAId,YAAYkB,AAFjBH,EAEsB,KAAK,EAC1B,IAAIf,YAAYkB,AAHjBH,EAGsB,KAAK,EAGa,MAGrDI,EAAAA,WAAuB,CAAC,CACtB,WAAYC,MAAM,IAAI,CAACJ,GACvBJ,WAAAA,EACA,MAAOD,EAAQ,MAAM,AACvB"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=minhash-worker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"minhash-worker.d.ts","sourceRoot":"","sources":["../../src/workers/minhash-worker.ts"],"names":[],"mappings":""}
@@ -0,0 +1,7 @@
1
+ import type { ParallelComputeFn } from "@mattgrill/nearline-core";
2
+ /**
3
+ * Create a parallel compute function using Node.js worker_threads.
4
+ * Returns null if the path to the compiled worker file cannot be resolved.
5
+ */
6
+ export declare function createNodeWorkerCompute(): ParallelComputeFn | null;
7
+ //# sourceMappingURL=pool.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pool.d.ts","sourceRoot":"","sources":["../../src/workers/pool.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAYlE;;;GAGG;AACH,wBAAgB,uBAAuB,IAAI,iBAAiB,GAAG,IAAI,CA+ElE"}
package/package.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "name": "@mattgrill/nearline-node",
3
+ "version": "1.0.0",
4
+ "description": "Node.js-optimized near-duplicate string detection using MinHash + LSH with worker_threads",
5
+ "type": "module",
6
+ "main": "./dist/index.cjs",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js",
13
+ "require": "./dist/index.cjs"
14
+ },
15
+ "./worker": {
16
+ "import": "./dist/minhash-worker.js",
17
+ "require": "./dist/minhash-worker.cjs"
18
+ }
19
+ },
20
+ "files": [
21
+ "dist"
22
+ ],
23
+ "scripts": {
24
+ "build": "rm -rf dist && rspack build -c rspack.config.mjs && tsc --emitDeclarationOnly --outDir dist",
25
+ "lint": "tsc --noEmit"
26
+ },
27
+ "engines": {
28
+ "node": ">=18.0.0"
29
+ },
30
+ "dependencies": {
31
+ "@mattgrill/nearline-core": "^1.0.0"
32
+ },
33
+ "devDependencies": {
34
+ "@types/node": "^20.0.0"
35
+ },
36
+ "repository": {
37
+ "type": "git",
38
+ "url": "https://github.com/matthewgrill/duplicate-finder.git",
39
+ "directory": "packages/server"
40
+ },
41
+ "author": "Matthew Grill",
42
+ "license": "MIT",
43
+ "keywords": [
44
+ "duplicate",
45
+ "deduplication",
46
+ "minhash",
47
+ "lsh",
48
+ "node",
49
+ "worker-threads"
50
+ ]
51
+ }