@mattgrill/nearline-node 1.0.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.cjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
(()=>{"use strict";var t={};t.d=(e,s)=>{for(var r in s)t.o(s,r)&&!t.o(e,r)&&Object.defineProperty(e,r,{enumerable:!0,get:s[r]})},t.o=(t,e)=>Object.prototype.hasOwnProperty.call(t,e),t.r=t=>{"u">typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})};var e={};t.r(e),t.d(e,{DuplicateFinder:()=>l,findDuplicates:()=>h});let s=require("node:os"),r=require("@mattgrill/nearline-core"),i=require("node:worker_threads"),n=require("node:path"),o=require("node:url");function a(){let t;try{t=function(){let t;try{t=(0,n.dirname)((0,o.fileURLToPath)("file:///
|
|
1
|
+
/**
 * CommonJS entry point of @mattgrill/nearline-node.
 *
 * Exports `findDuplicates` and `DuplicateFinder`. Both delegate the actual
 * work to @mattgrill/nearline-core; this module only adds an optional
 * worker_threads based parallel signature computation.
 */
(() => {
  "use strict";

  const fs = require("node:fs");
  const os = require("node:os");
  const path = require("node:path");
  const workerThreads = require("node:worker_threads");
  const core = require("@mattgrill/nearline-core");

  /** File name of the compiled worker script, expected next to this bundle. */
  const WORKER_FILE = "minhash-worker.cjs";

  /** buildIndex() only goes parallel at or above this many strings. */
  const PARALLEL_THRESHOLD = 5000;

  /** Default worker count: every CPU but one, and never fewer than one. */
  function defaultWorkerCount() {
    return Math.max(1, os.cpus().length - 1);
  }

  /**
   * Create a parallel compute function backed by Node.js worker_threads.
   *
   * The worker script is resolved relative to this module's runtime
   * directory. The previous build resolved it from a file URL that was baked
   * in at build time and pointed into the CI checkout, so on POSIX the
   * `__dirname` fallback was never reached.
   *
   * @returns {Function|null} An async function
   *   `(strings, ngramSize, numPermutations, hashA, hashB, numWorkers, batchSize)`
   *   resolving to a `Uint32Array` of signatures, or `null` when the worker
   *   script cannot be located.
   */
  function createNodeWorkerCompute() {
    let workerPath;
    try {
      workerPath = path.join(__dirname, WORKER_FILE);
    } catch {
      return null;
    }
    // Without the worker file, report "no parallel compute available" instead
    // of failing later inside `new Worker(...)`.
    if (!fs.existsSync(workerPath)) {
      return null;
    }

    return async (strings, ngramSize, numPermutations, hashA, hashB, numWorkers, batchSize) => {
      const total = strings.length;
      const signatures = new Uint32Array(total * numPermutations);

      // Split the input into [start, end) ranges of at most batchSize strings.
      const batches = [];
      for (let start = 0; start < total; start += batchSize) {
        batches.push({ start, end: Math.min(start + batchSize, total) });
      }

      // The coefficient arrays are identical for every batch; convert them once.
      const hashAList = Array.from(hashA);
      const hashBList = Array.from(hashB);

      const laneCount = Math.min(numWorkers, batches.length);
      let nextBatch = 0;

      // A lane handles one batch at a time, spawning a fresh worker per batch,
      // and resolves once no batches are left.
      const runLane = () =>
        new Promise((resolve, reject) => {
          const processNext = () => {
            if (nextBatch >= batches.length) {
              resolve();
              return;
            }

            const batch = batches[nextBatch++];
            const worker = new workerThreads.Worker(workerPath, {
              workerData: {
                strings: strings.slice(batch.start, batch.end),
                startIndex: batch.start,
                ngramSize,
                numPermutations,
                hashA: hashAList,
                hashB: hashBList,
              },
            });

            let answered = false;

            worker.on("message", (result) => {
              answered = true;
              // Copy this batch's signatures to their offset in the shared output.
              signatures.set(
                new Uint32Array(result.signatures),
                result.startIndex * numPermutations
              );
              worker.terminate().then(processNext).catch(reject);
            });

            worker.on("error", reject);

            // A worker that exits without posting a result would otherwise
            // leave this lane (and the caller) pending forever.
            worker.on("exit", (code) => {
              if (!answered) {
                reject(
                  new Error(`minhash worker exited with code ${code} before returning signatures`)
                );
              }
            });
          };

          processNext();
        });

      const lanes = [];
      for (let lane = 0; lane < laneCount; lane++) {
        lanes.push(runLane());
      }

      await Promise.all(lanes);
      return signatures;
    };
  }

  /**
   * Find near-duplicate strings in a dataset.
   *
   * @param {string[]} strings - Strings to compare.
   * @param {object} [opts] - Options forwarded to core `resolveOptions`.
   * @returns {Promise<object>} Whatever core `executePipeline` resolves to.
   */
  async function findDuplicates(strings, opts) {
    const { resolveOptions, executePipeline } = core;
    const options = resolveOptions(opts, defaultWorkerCount());
    const parallelCompute = createNodeWorkerCompute();
    return executePipeline(strings, options, parallelCompute);
  }

  /**
   * Class API for incremental use: add strings, build the index, then query.
   */
  class DuplicateFinder {
    options;
    strings = [];
    signatures = null;
    hashA;
    hashB;

    /**
     * @param {object} [opts] - Options forwarded to core `resolveOptions`.
     */
    constructor(opts) {
      const { resolveOptions, generateHashCoefficients } = core;
      this.options = resolveOptions(opts, defaultWorkerCount());
      const { hashA, hashB } = generateHashCoefficients(
        this.options.numPermutations,
        this.options.seed
      );
      this.hashA = hashA;
      this.hashB = hashB;
    }

    /**
     * Append strings and invalidate any previously built index.
     *
     * @param {Iterable<string>} strings - Strings to add.
     */
    addStrings(strings) {
      // Push one at a time: `push(...strings)` passes every element as a call
      // argument and throws RangeError on very large inputs.
      for (const value of strings) {
        this.strings.push(value);
      }
      this.signatures = null;
    }

    /**
     * Compute signatures for all added strings, in parallel when a worker
     * compute function is available, workers are enabled and the input is
     * large enough; otherwise on the calling thread.
     *
     * @returns {Promise<void>}
     */
    async buildIndex() {
      const parallelCompute = createNodeWorkerCompute();
      const { options } = this;

      if (parallelCompute && options.workers > 0 && this.strings.length >= PARALLEL_THRESHOLD) {
        this.signatures = await parallelCompute(
          this.strings,
          options.ngramSize,
          options.numPermutations,
          this.hashA,
          this.hashB,
          options.workers,
          options.batchSize
        );
        return;
      }

      const { computeAllSignatures } = core;
      this.signatures = computeAllSignatures(
        this.strings,
        options.ngramSize,
        options.numPermutations,
        this.hashA,
        this.hashB,
        options.preprocess
      );
    }

    /**
     * Query a string against the built index.
     *
     * @param {string} str - String to look up.
     * @returns {Array} Result of core `querySignatures`.
     * @throws {Error} If buildIndex() has not completed since the last addStrings().
     */
    query(str) {
      if (!this.signatures) {
        throw Error("Index not built. Call buildIndex() first.");
      }
      const { querySignatures } = core;
      return querySignatures(
        str,
        this.options,
        this.signatures,
        this.hashA,
        this.hashB,
        this.strings.length
      );
    }
  }

  // Build the export object the same way the bundler runtime did: flagged as
  // an ES module namespace, with enumerable getter-backed named exports.
  const exported = {};
  if (typeof Symbol !== "undefined" && Symbol.toStringTag) {
    Object.defineProperty(exported, Symbol.toStringTag, { value: "Module" });
  }
  Object.defineProperty(exported, "__esModule", { value: true });
  Object.defineProperty(exported, "DuplicateFinder", {
    enumerable: true,
    get: () => DuplicateFinder,
  });
  Object.defineProperty(exported, "findDuplicates", {
    enumerable: true,
    get: () => findDuplicates,
  });

  module.exports = exported;
})();
|
|
2
2
|
//# sourceMappingURL=index.cjs.map
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.cjs","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/webpack/runtime/make_namespace_object","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> => 
{\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function 
findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["e","Object","Symbol","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","defaultWorkers","cpus","options","resolveOptions","parallelCompute","executePipeline","DuplicateFinder","generateHashCoefficients","computeAllSignatures","str","Error","querySignatures"],"mappings":"2BAAA,GAAoB,CAAC,CAAG,CAACA,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAACA,EAAS,IACzEC,OAAO,cAAc,CAACD,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUC,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,GCClF,EAAoB,CAAC,CAAG,AAAC,IACrB,AAAkB,IAAlB,OAAOC,QAA0BA,OAAO,WAAW,EACrDD,OAAO,cAAc,CAAC,EAASC,OAAO,WAAW,CAAE,CAAE,MAAO,QAAS,GAEtED,OAAO,cAAc,CAAC,EAAS,aAAc,CAAE,MAAO,EAAK,EAC5D,E,iNCaO,SAASE,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAAAA,GAAAA,EAAAA,OAAAA,AAAAA,EAAQC,AAAAA,GAAAA,EAAAA,aAAAA,AAAAA,EAAc,gGACrC,CAAE,KAAM,CACNF,EAAaG,SACf,CACA,MAAOC,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,EAAKJ,EAAY,qBAC1B,GAUE,CAAE,KAAM,CACN,OAAO,IACT,CAEA,OAAO,MACLK,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,KAEA,IAAMC,EAAIP,EAAQ,MAAM,CAClBQ,EAAa,IAAIC,YAAYF,EAAIL,GAEjCQ,EAAiD,EAAE,CACzD,IAAK,IAAIC,EAAI,EAAGA,EAAIJ,EAAGI,GAAKL,EAC1BI,EAAQ,IAAI,CAAC,CAAE,MAAOC,EAAG,IAAKC,KAAK,GAAG,CAACD,EAAIL,EAAWC,EAAG,GAG3D,IAAMM,EAAgBD,KAAK,GAAG,CAACP,EAAYK,EAAQ,MAAM,EACrDI,EAAa,EAEXC,EAAe,IACZ,IAAIC,QAAQ,CAACC,EAASC,KAC3B,IAAMC,EAAc,KAClB,GAAIL,GAAcJ,EAAQ,MAAM,CAAE,YAChCO,IAIF,IA
AMG,EAAQV,CAAO,CAACI,IAAa,CAC7BO,EAAerB,EAAQ,KAAK,CAACoB,EAAM,KAAK,CAAEA,EAAM,GAAG,EAEnDE,EAAS,IAAIC,EAAAA,MAAMA,CAAC9B,EAAY,CACpC,WAAY,CACV,QAAS4B,EACT,WAAYD,EAAM,KAAK,CACvBnB,UAAAA,EACAC,gBAAAA,EACA,MAAOsB,MAAM,IAAI,CAACrB,GAClB,MAAOqB,MAAM,IAAI,CAACpB,EACpB,CACF,GAEAkB,EAAO,EAAE,CACP,UACA,AAACG,IAKC,IAAMC,EAAa,IAAIjB,YAAYgB,EAAO,UAAU,EAC9CE,EAASF,EAAO,UAAU,CAAGvB,EACnCM,EAAW,GAAG,CAACkB,EAAYC,GAC3BL,EAAO,SAAS,GAAG,IAAI,CAACH,GAAa,KAAK,CAACD,EAC7C,GAGFI,EAAO,EAAE,CAAC,QAASJ,EACrB,EAEAC,GACF,GAGIS,EAAkC,EAAE,CAC1C,IAAK,IAAIjB,EAAI,EAAGA,EAAIE,EAAeF,IACjCiB,EAAe,IAAI,CAACb,KAItB,OADA,MAAMC,QAAQ,GAAG,CAACY,GACXpB,CACT,CACF,CChFO,eAAeqB,EACpB7B,CAAiB,CACjB8B,CAA6B,EAE7B,IAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,GAC7CC,EAAUC,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GAC/BI,EAAkB3C,IACxB,MAAO4C,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EAAgBpC,EAASiC,EAASE,EAC3C,CAKO,MAAME,EACH,OAAyB,AACzB,SAAoB,EAAE,AAAC,AACvB,YAAiC,IAAK,AACtC,MAAmB,AACnB,MAAmB,AAE3B,aAAYP,CAA6B,CAAE,CACzC,MAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,EACnD,KAAI,CAAC,OAAO,CAAGE,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GACpC,KAAM,CAAE5B,MAAAA,CAAK,CAAEC,MAAAA,CAAK,CAAE,CAAGkC,AAAAA,GAAAA,EAAAA,wBAAAA,AAAAA,EACvB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAEnB,KAAI,CAAC,KAAK,CAAGnC,EACb,IAAI,CAAC,KAAK,CAAGC,CACf,CAEA,WAAWJ,CAAiB,CAAQ,CAClC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAIA,GACrB,IAAI,CAAC,UAAU,CAAG,IACpB,CAEA,MAAM,YAA4B,CAChC,IAAMmC,EAAkB3C,GAEpB2C,CAAAA,GAAmB,IAAI,CAAC,OAAO,CAAC,OAAO,CAAG,GAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAI,IACxE,IAAI,CAAC,UAAU,CAAG,MAAMA,EACtB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,OAAO,CACpB,IAAI,CAAC,OAAO,CAAC,SAAS,EAGxB,IAAI,CAAC,UAAU,CAAGI,AAAAA,GAAAA,EAAAA,oBAAAA,AAAAA,EAChB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAA
O,CAAC,UAAU,CAG7B,CAEA,MAAMC,CAAW,CAAmB,CAClC,GAAI,CAAC,IAAI,CAAC,UAAU,CAClB,MAAM,AAAIC,MAAM,6CAElB,MAAOC,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EACLF,EACA,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,UAAU,CACf,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,MAAM,CAEvB,CACF,C"}
|
|
1
|
+
{"version":3,"file":"index.cjs","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/webpack/runtime/make_namespace_object","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> => 
{\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function 
findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["e","Object","Symbol","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","defaultWorkers","cpus","options","resolveOptions","parallelCompute","executePipeline","DuplicateFinder","generateHashCoefficients","computeAllSignatures","str","Error","querySignatures"],"mappings":"2BAAA,GAAoB,CAAC,CAAG,CAACA,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAACA,EAAS,IACzEC,OAAO,cAAc,CAACD,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUC,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,GCClF,EAAoB,CAAC,CAAG,AAAC,IACrB,AAAkB,IAAlB,OAAOC,QAA0BA,OAAO,WAAW,EACrDD,OAAO,cAAc,CAAC,EAASC,OAAO,WAAW,CAAE,CAAE,MAAO,QAAS,GAEtED,OAAO,cAAc,CAAC,EAAS,aAAc,CAAE,MAAO,EAAK,EAC5D,E,iNCaO,SAASE,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAAAA,GAAAA,EAAAA,OAAAA,AAAAA,EAAQC,AAAAA,GAAAA,EAAAA,aAAAA,AAAAA,EAAc,kFACrC,CAAE,KAAM,CACNF,EAAaG,SACf,CACA,MAAOC,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,EAAKJ,EAAY,qBAC1B,GAUE,CAAE,KAAM,CACN,OAAO,IACT,CAEA,OAAO,MACLK,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,KAEA,IAAMC,EAAIP,EAAQ,MAAM,CAClBQ,EAAa,IAAIC,YAAYF,EAAIL,GAEjCQ,EAAiD,EAAE,CACzD,IAAK,IAAIC,EAAI,EAAGA,EAAIJ,EAAGI,GAAKL,EAC1BI,EAAQ,IAAI,CAAC,CAAE,MAAOC,EAAG,IAAKC,KAAK,GAAG,CAACD,EAAIL,EAAWC,EAAG,GAG3D,IAAMM,EAAgBD,KAAK,GAAG,CAACP,EAAYK,EAAQ,MAAM,EACrDI,EAAa,EAEXC,EAAe,IACZ,IAAIC,QAAQ,CAACC,EAASC,KAC3B,IAAMC,EAAc,KAClB,GAAIL,GAAcJ,EAAQ,MAAM,CAAE,YAChCO,IAIF,IA
AMG,EAAQV,CAAO,CAACI,IAAa,CAC7BO,EAAerB,EAAQ,KAAK,CAACoB,EAAM,KAAK,CAAEA,EAAM,GAAG,EAEnDE,EAAS,IAAIC,EAAAA,MAAMA,CAAC9B,EAAY,CACpC,WAAY,CACV,QAAS4B,EACT,WAAYD,EAAM,KAAK,CACvBnB,UAAAA,EACAC,gBAAAA,EACA,MAAOsB,MAAM,IAAI,CAACrB,GAClB,MAAOqB,MAAM,IAAI,CAACpB,EACpB,CACF,GAEAkB,EAAO,EAAE,CACP,UACA,AAACG,IAKC,IAAMC,EAAa,IAAIjB,YAAYgB,EAAO,UAAU,EAC9CE,EAASF,EAAO,UAAU,CAAGvB,EACnCM,EAAW,GAAG,CAACkB,EAAYC,GAC3BL,EAAO,SAAS,GAAG,IAAI,CAACH,GAAa,KAAK,CAACD,EAC7C,GAGFI,EAAO,EAAE,CAAC,QAASJ,EACrB,EAEAC,GACF,GAGIS,EAAkC,EAAE,CAC1C,IAAK,IAAIjB,EAAI,EAAGA,EAAIE,EAAeF,IACjCiB,EAAe,IAAI,CAACb,KAItB,OADA,MAAMC,QAAQ,GAAG,CAACY,GACXpB,CACT,CACF,CChFO,eAAeqB,EACpB7B,CAAiB,CACjB8B,CAA6B,EAE7B,IAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,GAC7CC,EAAUC,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GAC/BI,EAAkB3C,IACxB,MAAO4C,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EAAgBpC,EAASiC,EAASE,EAC3C,CAKO,MAAME,EACH,OAAyB,AACzB,SAAoB,EAAE,AAAC,AACvB,YAAiC,IAAK,AACtC,MAAmB,AACnB,MAAmB,AAE3B,aAAYP,CAA6B,CAAE,CACzC,MAAMC,EAAiBnB,KAAK,GAAG,CAAC,EAAGoB,AAAAA,GAAAA,EAAAA,IAAAA,AAAAA,IAAO,MAAM,CAAG,EACnD,KAAI,CAAC,OAAO,CAAGE,AAAAA,GAAAA,EAAAA,cAAAA,AAAAA,EAAeJ,EAAMC,GACpC,KAAM,CAAE5B,MAAAA,CAAK,CAAEC,MAAAA,CAAK,CAAE,CAAGkC,AAAAA,GAAAA,EAAAA,wBAAAA,AAAAA,EACvB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAEnB,KAAI,CAAC,KAAK,CAAGnC,EACb,IAAI,CAAC,KAAK,CAAGC,CACf,CAEA,WAAWJ,CAAiB,CAAQ,CAClC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAIA,GACrB,IAAI,CAAC,UAAU,CAAG,IACpB,CAEA,MAAM,YAA4B,CAChC,IAAMmC,EAAkB3C,GAEpB2C,CAAAA,GAAmB,IAAI,CAAC,OAAO,CAAC,OAAO,CAAG,GAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAI,IACxE,IAAI,CAAC,UAAU,CAAG,MAAMA,EACtB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,OAAO,CACpB,IAAI,CAAC,OAAO,CAAC,SAAS,EAGxB,IAAI,CAAC,UAAU,CAAGI,AAAAA,GAAAA,EAAAA,oBAAAA,AAAAA,EAChB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAA
O,CAAC,UAAU,CAG7B,CAEA,MAAMC,CAAW,CAAmB,CAClC,GAAI,CAAC,IAAI,CAAC,UAAU,CAClB,MAAM,AAAIC,MAAM,6CAElB,MAAOC,AAAAA,GAAAA,EAAAA,eAAAA,AAAAA,EACLF,EACA,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,UAAU,CACf,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,MAAM,CAEvB,CACF,C"}
|
package/dist/index.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import{fileURLToPath as t,fileURLToPath as s}from"node:url";import{dirname as r,dirname as e,join as i}from"node:path";import{cpus as n}from"node:os";import{computeAllSignatures as a,executePipeline as o,generateHashCoefficients as h,querySignatures as l,resolveOptions as
|
|
1
|
+
/**
 * ES module entry point of @mattgrill/nearline-node.
 *
 * Exports `findDuplicates` and `DuplicateFinder`. Both delegate the actual
 * work to @mattgrill/nearline-core; this module only adds an optional
 * worker_threads based parallel signature computation.
 */
import { existsSync } from "node:fs";
import { cpus } from "node:os";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { Worker } from "node:worker_threads";
import {
  computeAllSignatures,
  executePipeline,
  generateHashCoefficients,
  querySignatures,
  resolveOptions,
} from "@mattgrill/nearline-core";

/** File name of the compiled worker script, expected next to this bundle. */
const WORKER_FILE = "minhash-worker.cjs";

/** buildIndex() only goes parallel at or above this many strings. */
const PARALLEL_THRESHOLD = 5000;

/** Default worker count: every CPU but one, and never fewer than one. */
function defaultWorkerCount() {
  return Math.max(1, cpus().length - 1);
}

/**
 * Create a parallel compute function backed by Node.js worker_threads.
 *
 * The worker script is resolved relative to this module's runtime location
 * (`import.meta.url`). The previous build resolved it from a file URL that was
 * baked in at build time and pointed into the CI checkout, so on POSIX the
 * runtime-directory fallback was never reached.
 *
 * @returns {Function|null} An async function
 *   `(strings, ngramSize, numPermutations, hashA, hashB, numWorkers, batchSize)`
 *   resolving to a `Uint32Array` of signatures, or `null` when the worker
 *   script cannot be located.
 */
function createNodeWorkerCompute() {
  let workerPath;
  try {
    workerPath = join(dirname(fileURLToPath(import.meta.url)), WORKER_FILE);
  } catch {
    // import.meta.url is not a usable file: URL in this environment.
    return null;
  }
  // Without the worker file, report "no parallel compute available" instead of
  // failing later inside `new Worker(...)`.
  if (!existsSync(workerPath)) {
    return null;
  }

  return async (strings, ngramSize, numPermutations, hashA, hashB, numWorkers, batchSize) => {
    const total = strings.length;
    const signatures = new Uint32Array(total * numPermutations);

    // Split the input into [start, end) ranges of at most batchSize strings.
    const batches = [];
    for (let start = 0; start < total; start += batchSize) {
      batches.push({ start, end: Math.min(start + batchSize, total) });
    }

    // The coefficient arrays are identical for every batch; convert them once.
    const hashAList = Array.from(hashA);
    const hashBList = Array.from(hashB);

    const laneCount = Math.min(numWorkers, batches.length);
    let nextBatch = 0;

    // A lane handles one batch at a time, spawning a fresh worker per batch,
    // and resolves once no batches are left.
    const runLane = () =>
      new Promise((resolve, reject) => {
        const processNext = () => {
          if (nextBatch >= batches.length) {
            resolve();
            return;
          }

          const batch = batches[nextBatch++];
          const worker = new Worker(workerPath, {
            workerData: {
              strings: strings.slice(batch.start, batch.end),
              startIndex: batch.start,
              ngramSize,
              numPermutations,
              hashA: hashAList,
              hashB: hashBList,
            },
          });

          let answered = false;

          worker.on("message", (result) => {
            answered = true;
            // Copy this batch's signatures to their offset in the shared output.
            signatures.set(
              new Uint32Array(result.signatures),
              result.startIndex * numPermutations
            );
            worker.terminate().then(processNext).catch(reject);
          });

          worker.on("error", reject);

          // A worker that exits without posting a result would otherwise leave
          // this lane (and the caller) pending forever.
          worker.on("exit", (code) => {
            if (!answered) {
              reject(
                new Error(`minhash worker exited with code ${code} before returning signatures`)
              );
            }
          });
        };

        processNext();
      });

    const lanes = [];
    for (let lane = 0; lane < laneCount; lane++) {
      lanes.push(runLane());
    }

    await Promise.all(lanes);
    return signatures;
  };
}

/**
 * Find near-duplicate strings in a dataset.
 *
 * @param {string[]} strings - Strings to compare.
 * @param {object} [opts] - Options forwarded to core `resolveOptions`.
 * @returns {Promise<object>} Whatever core `executePipeline` resolves to.
 */
async function findDuplicates(strings, opts) {
  const options = resolveOptions(opts, defaultWorkerCount());
  const parallelCompute = createNodeWorkerCompute();
  return executePipeline(strings, options, parallelCompute);
}

/**
 * Class API for incremental use: add strings, build the index, then query.
 */
class DuplicateFinder {
  options;
  strings = [];
  signatures = null;
  hashA;
  hashB;

  /**
   * @param {object} [opts] - Options forwarded to core `resolveOptions`.
   */
  constructor(opts) {
    this.options = resolveOptions(opts, defaultWorkerCount());
    const { hashA, hashB } = generateHashCoefficients(
      this.options.numPermutations,
      this.options.seed
    );
    this.hashA = hashA;
    this.hashB = hashB;
  }

  /**
   * Append strings and invalidate any previously built index.
   *
   * @param {Iterable<string>} strings - Strings to add.
   */
  addStrings(strings) {
    // Push one at a time: `push(...strings)` passes every element as a call
    // argument and throws RangeError on very large inputs.
    for (const value of strings) {
      this.strings.push(value);
    }
    this.signatures = null;
  }

  /**
   * Compute signatures for all added strings, in parallel when a worker
   * compute function is available, workers are enabled and the input is large
   * enough; otherwise on the calling thread.
   *
   * @returns {Promise<void>}
   */
  async buildIndex() {
    const parallelCompute = createNodeWorkerCompute();
    const { options } = this;

    if (parallelCompute && options.workers > 0 && this.strings.length >= PARALLEL_THRESHOLD) {
      this.signatures = await parallelCompute(
        this.strings,
        options.ngramSize,
        options.numPermutations,
        this.hashA,
        this.hashB,
        options.workers,
        options.batchSize
      );
      return;
    }

    this.signatures = computeAllSignatures(
      this.strings,
      options.ngramSize,
      options.numPermutations,
      this.hashA,
      this.hashB,
      options.preprocess
    );
  }

  /**
   * Query a string against the built index.
   *
   * @param {string} str - String to look up.
   * @returns {Array} Result of core `querySignatures`.
   * @throws {Error} If buildIndex() has not completed since the last addStrings().
   */
  query(str) {
    if (!this.signatures) {
      throw Error("Index not built. Call buildIndex() first.");
    }
    return querySignatures(
      str,
      this.options,
      this.signatures,
      this.hashA,
      this.hashB,
      this.strings.length
    );
  }
}

export { DuplicateFinder, findDuplicates };
|
|
2
2
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> => {\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const 
processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental 
use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["Object","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","executePipeline","resolveOptions","cpus","DuplicateFinder","defaultWorkers","generateHashCoefficients","parallelCompute","computeAllSignatures","str","Error","querySignatures"],"mappings":"uWAAA,GAAoB,CAAC,CAAG,CAAC,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAAC,EAAS,IACzEA,OAAO,cAAc,CAAC,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUA,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,G,8DCmB3E,SAASC,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,EAAQC,EAAc,
|
|
1
|
+
{"version":3,"file":"index.js","sources":["webpack://@mattgrill/nearline-node/webpack/runtime/define_property_getters","webpack://@mattgrill/nearline-node/webpack/runtime/has_own_property","webpack://@mattgrill/nearline-node/./src/workers/pool.ts","webpack://@mattgrill/nearline-node/./src/finder.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","import { Worker } from \"node:worker_threads\";\nimport { join, dirname } from \"node:path\";\nimport { fileURLToPath } from \"node:url\";\nimport type { ParallelComputeFn } from \"@mattgrill/nearline-core\";\n\nfunction resolveWorkerPath(): string {\n let currentDir: string;\n try {\n currentDir = dirname(fileURLToPath(import.meta.url));\n } catch {\n currentDir = __dirname;\n }\n return join(currentDir, \"minhash-worker.cjs\");\n}\n\n/**\n * Create a parallel compute function using Node.js worker_threads.\n * Returns null if the compiled worker file is not available.\n */\nexport function createNodeWorkerCompute(): ParallelComputeFn | null {\n let workerPath: string;\n try {\n workerPath = resolveWorkerPath();\n } catch {\n return null;\n }\n\n return async (\n strings: string[],\n ngramSize: number,\n numPermutations: number,\n hashA: Uint32Array,\n hashB: Uint32Array,\n numWorkers: number,\n batchSize: number\n ): Promise<Uint32Array> => {\n const n = strings.length;\n const signatures = new Uint32Array(n * numPermutations);\n\n const batches: Array<{ start: number; end: number }> = [];\n for (let i = 0; i < n; i += batchSize) {\n batches.push({ start: i, end: Math.min(i + batchSize, n) });\n }\n\n const actualWorkers = Math.min(numWorkers, batches.length);\n let batchIndex = 0;\n\n const 
processBatch = (): Promise<void> => {\n return new Promise((resolve, reject) => {\n const processNext = (): void => {\n if (batchIndex >= batches.length) {\n resolve();\n return;\n }\n\n const batch = batches[batchIndex++];\n const batchStrings = strings.slice(batch.start, batch.end);\n\n const worker = new Worker(workerPath, {\n workerData: {\n strings: batchStrings,\n startIndex: batch.start,\n ngramSize,\n numPermutations,\n hashA: Array.from(hashA),\n hashB: Array.from(hashB),\n },\n });\n\n worker.on(\n \"message\",\n (result: {\n signatures: number[];\n startIndex: number;\n count: number;\n }) => {\n const resultSigs = new Uint32Array(result.signatures);\n const offset = result.startIndex * numPermutations;\n signatures.set(resultSigs, offset);\n worker.terminate().then(processNext).catch(reject);\n }\n );\n\n worker.on(\"error\", reject);\n };\n\n processNext();\n });\n };\n\n const workerPromises: Promise<void>[] = [];\n for (let i = 0; i < actualWorkers; i++) {\n workerPromises.push(processBatch());\n }\n\n await Promise.all(workerPromises);\n return signatures;\n };\n}\n","import { cpus } from \"node:os\";\nimport {\n type DuplicateFinderOptions,\n type DuplicatePair,\n type FindDuplicatesResult,\n type ResolvedOptions,\n resolveOptions,\n executePipeline,\n querySignatures,\n generateHashCoefficients,\n computeAllSignatures,\n} from \"@mattgrill/nearline-core\";\nimport { createNodeWorkerCompute } from \"./workers/pool\";\n\n/**\n * Find near-duplicate strings in a dataset (Node.js-optimized).\n * Uses worker_threads for parallel computation above 5000 strings.\n */\nexport async function findDuplicates(\n strings: string[],\n opts?: DuplicateFinderOptions\n): Promise<FindDuplicatesResult> {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n const options = resolveOptions(opts, defaultWorkers);\n const parallelCompute = createNodeWorkerCompute();\n return executePipeline(strings, options, parallelCompute);\n}\n\n/**\n * Class API for incremental 
use and querying against a built index (Node.js-optimized).\n */\nexport class DuplicateFinder {\n private options: ResolvedOptions;\n private strings: string[] = [];\n private signatures: Uint32Array | null = null;\n private hashA: Uint32Array;\n private hashB: Uint32Array;\n\n constructor(opts?: DuplicateFinderOptions) {\n const defaultWorkers = Math.max(1, cpus().length - 1);\n this.options = resolveOptions(opts, defaultWorkers);\n const { hashA, hashB } = generateHashCoefficients(\n this.options.numPermutations,\n this.options.seed\n );\n this.hashA = hashA;\n this.hashB = hashB;\n }\n\n addStrings(strings: string[]): void {\n this.strings.push(...strings);\n this.signatures = null;\n }\n\n async buildIndex(): Promise<void> {\n const parallelCompute = createNodeWorkerCompute();\n\n if (parallelCompute && this.options.workers > 0 && this.strings.length >= 5000) {\n this.signatures = await parallelCompute(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.workers,\n this.options.batchSize\n );\n } else {\n this.signatures = computeAllSignatures(\n this.strings,\n this.options.ngramSize,\n this.options.numPermutations,\n this.hashA,\n this.hashB,\n this.options.preprocess\n );\n }\n }\n\n query(str: string): DuplicatePair[] {\n if (!this.signatures) {\n throw new Error(\"Index not built. 
Call buildIndex() first.\");\n }\n return querySignatures(\n str,\n this.options,\n this.signatures,\n this.hashA,\n this.hashB,\n this.strings.length\n );\n }\n}\n"],"names":["Object","createNodeWorkerCompute","workerPath","resolveWorkerPath","currentDir","dirname","fileURLToPath","__dirname","join","strings","ngramSize","numPermutations","hashA","hashB","numWorkers","batchSize","n","signatures","Uint32Array","batches","i","Math","actualWorkers","batchIndex","processBatch","Promise","resolve","reject","processNext","batch","batchStrings","worker","Worker","Array","result","resultSigs","offset","workerPromises","findDuplicates","opts","executePipeline","resolveOptions","cpus","DuplicateFinder","defaultWorkers","generateHashCoefficients","parallelCompute","computeAllSignatures","str","Error","querySignatures"],"mappings":"uWAAA,GAAoB,CAAC,CAAG,CAAC,EAAS,KACjC,IAAI,IAAI,KAAO,EACL,EAAoB,CAAC,CAAC,EAAY,IAAQ,CAAC,EAAoB,CAAC,CAAC,EAAS,IACzEA,OAAO,cAAc,CAAC,EAAS,EAAK,CAAE,WAAY,GAAM,IAAK,CAAU,CAAC,EAAI,AAAC,EAGzF,ECNA,EAAoB,CAAC,CAAG,CAAC,EAAK,IAAUA,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,EAAK,G,8DCmB3E,SAASC,IACd,IAAIC,EACJ,GAAI,CACFA,EAAaC,AAjBjB,WACE,IAAIC,EACJ,GAAI,CACFA,EAAaC,EAAQC,EAAc,kFACrC,CAAE,KAAM,CACNF,EAAaG,CACf,CACA,OAAOC,EAAKJ,EAAY,qBAC1B,GAUE,CAAE,KAAM,CACN,OAAO,IACT,CAEA,OAAO,MACLK,EACAC,EACAC,EACAC,EACAC,EACAC,EACAC,KAEA,IAAMC,EAAIP,EAAQ,MAAM,CAClBQ,EAAa,IAAIC,YAAYF,EAAIL,GAEjCQ,EAAiD,EAAE,CACzD,IAAK,IAAIC,EAAI,EAAGA,EAAIJ,EAAGI,GAAKL,EAC1BI,EAAQ,IAAI,CAAC,CAAE,MAAOC,EAAG,IAAKC,KAAK,GAAG,CAACD,EAAIL,EAAWC,EAAG,GAG3D,IAAMM,EAAgBD,KAAK,GAAG,CAACP,EAAYK,EAAQ,MAAM,EACrDI,EAAa,EAEXC,EAAe,IACZ,IAAIC,QAAQ,CAACC,EAASC,KAC3B,IAAMC,EAAc,KAClB,GAAIL,GAAcJ,EAAQ,MAAM,CAAE,YAChCO,IAIF,IAAMG,EAAQV,CAAO,CAACI,IAAa,CAC7BO,EAAerB,EAAQ,KAAK,CAACoB,EAAM,KAAK,CAAEA,EAAM,GAAG,EAEnDE,EAAS,IAAIC,EAAO9B,EAAY,CACpC,WAAY,CACV,QAAS4B,EACT,WAAYD,EAAM,KAAK,CACvBnB,UAAAA,EACAC,gBAAAA,EACA,MAAOsB,MAAM,IAAI,CAACrB,GAClB,MAAOqB,MAAM,IAAI,CAACpB,EACpB,CACF,GAEAkB,EAAO,EAAE,CACP,UACA,AAACG,IAKC,I
AAMC,EAAa,IAAIjB,YAAYgB,EAAO,UAAU,EAC9CE,EAASF,EAAO,UAAU,CAAGvB,EACnCM,EAAW,GAAG,CAACkB,EAAYC,GAC3BL,EAAO,SAAS,GAAG,IAAI,CAACH,GAAa,KAAK,CAACD,EAC7C,GAGFI,EAAO,EAAE,CAAC,QAASJ,EACrB,EAEAC,GACF,GAGIS,EAAkC,EAAE,CAC1C,IAAK,IAAIjB,EAAI,EAAGA,EAAIE,EAAeF,IACjCiB,EAAe,IAAI,CAACb,KAItB,OADA,MAAMC,QAAQ,GAAG,CAACY,GACXpB,CACT,CACF,CChFO,eAAeqB,EACpB7B,CAAiB,CACjB8B,CAA6B,EAK7B,OAAOC,EAAgB/B,EAFPgC,EAAeF,EADRlB,KAAK,GAAG,CAAC,EAAGqB,IAAO,MAAM,CAAG,IAE3BzC,IAE1B,CAKO,MAAM0C,EACH,OAAyB,AACzB,SAAoB,EAAE,AAAC,AACvB,YAAiC,IAAK,AACtC,MAAmB,AACnB,MAAmB,AAE3B,aAAYJ,CAA6B,CAAE,CACzC,IAAMK,EAAiBvB,KAAK,GAAG,CAAC,EAAGqB,IAAO,MAAM,CAAG,EACnD,KAAI,CAAC,OAAO,CAAGD,EAAeF,EAAMK,GACpC,GAAM,CAAEhC,MAAAA,CAAK,CAAEC,MAAAA,CAAK,CAAE,CAAGgC,EACvB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,OAAO,CAAC,IAAI,CAEnB,KAAI,CAAC,KAAK,CAAGjC,EACb,IAAI,CAAC,KAAK,CAAGC,CACf,CAEA,WAAWJ,CAAiB,CAAQ,CAClC,IAAI,CAAC,OAAO,CAAC,IAAI,IAAIA,GACrB,IAAI,CAAC,UAAU,CAAG,IACpB,CAEA,MAAM,YAA4B,CAChC,IAAMqC,EAAkB7C,GAEpB6C,CAAAA,GAAmB,IAAI,CAAC,OAAO,CAAC,OAAO,CAAG,GAAK,IAAI,CAAC,OAAO,CAAC,MAAM,EAAI,IACxE,IAAI,CAAC,UAAU,CAAG,MAAMA,EACtB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,OAAO,CACpB,IAAI,CAAC,OAAO,CAAC,SAAS,EAGxB,IAAI,CAAC,UAAU,CAAGC,EAChB,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,OAAO,CAAC,SAAS,CACtB,IAAI,CAAC,OAAO,CAAC,eAAe,CAC5B,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,UAAU,CAG7B,CAEA,MAAMC,CAAW,CAAmB,CAClC,GAAI,CAAC,IAAI,CAAC,UAAU,CAClB,MAAM,AAAIC,MAAM,6CAElB,OAAOC,EACLF,EACA,IAAI,CAAC,OAAO,CACZ,IAAI,CAAC,UAAU,CACf,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,KAAK,CACV,IAAI,CAAC,OAAO,CAAC,MAAM,CAEvB,CACF,C"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mattgrill/nearline-node",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Node.js-optimized near-duplicate string detection using MinHash + LSH with worker_threads",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -28,14 +28,14 @@
|
|
|
28
28
|
"node": ">=18.0.0"
|
|
29
29
|
},
|
|
30
30
|
"dependencies": {
|
|
31
|
-
"@mattgrill/nearline-core": "^1.
|
|
31
|
+
"@mattgrill/nearline-core": "^1.3.0"
|
|
32
32
|
},
|
|
33
33
|
"devDependencies": {
|
|
34
34
|
"@types/node": "^20.0.0"
|
|
35
35
|
},
|
|
36
36
|
"repository": {
|
|
37
37
|
"type": "git",
|
|
38
|
-
"url": "https://github.com/
|
|
38
|
+
"url": "https://github.com/mattgrill/nearline.git",
|
|
39
39
|
"directory": "packages/server"
|
|
40
40
|
},
|
|
41
41
|
"author": "Matthew Grill",
|