hypgrep 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +90 -0
- package/bin/cli.js +60 -0
- package/bin/inspect.js +67 -0
- package/bin/load.js +43 -0
- package/package.json +56 -0
- package/src/constants.js +8 -0
- package/src/createIndex.js +162 -0
- package/src/index.d.ts +29 -0
- package/src/index.js +6 -0
- package/src/parquetFind.js +113 -0
- package/src/parquetSearch.js +128 -0
- package/src/queryIndex.js +171 -0
- package/src/stemmer.js +78 -0
- package/src/tokenize.js +72 -0
- package/src/types.d.ts +92 -0
- package/src/utils.js +44 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# HypGrep
|
|
2
|
+
|
|
3
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
|
4
|
+

|
|
5
|
+
|
|
6
|
+
Build a compact full-text search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer).
|
|
7
|
+
|
|
8
|
+
## Why?
|
|
9
|
+
|
|
10
|
+
Enable efficient full-text search on large Parquet datasets from any client without a server. Store your Parquet dataset on S3, generate a compact index file, and query it directly from a browser or other clients using HTTP range requests. The index tells you exactly which row blocks to fetch, so you only download the data you need.
|
|
11
|
+
|
|
12
|
+
Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure.
|
|
13
|
+
|
|
14
|
+
## CLI usage
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npx hypgrep dataset.parquet [dataset.index.parquet]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
To install as a system-wide CLI tool:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install -g hypgrep
|
|
24
|
+
hypgrep dataset.parquet [dataset.index.parquet]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Find rows in a parquet file in JavaScript
|
|
28
|
+
|
|
29
|
+
Use `parquetFind` to find rows matching a query while preserving natural row order (like Ctrl+F):
|
|
30
|
+
|
|
31
|
+
```javascript
|
|
32
|
+
import { parquetFind } from 'hypgrep'
|
|
33
|
+
|
|
34
|
+
for await (const row of parquetFind({
|
|
35
|
+
query: 'serverless',
|
|
36
|
+
url: 'https://s3.hyperparam.app/hypgrep/wiki_en.parquet',
|
|
37
|
+
})) {
|
|
38
|
+
console.log(row) // { title: '...', text: '...' }
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Ranked search
|
|
43
|
+
|
|
44
|
+
Use `parquetSearch` to rank results by BM25 relevance score (like a search engine):
|
|
45
|
+
|
|
46
|
+
```javascript
|
|
47
|
+
import { parquetSearch } from 'hypgrep'
|
|
48
|
+
|
|
49
|
+
for await (const row of parquetSearch({
|
|
50
|
+
query: 'serverless',
|
|
51
|
+
url: 'https://s3.hyperparam.app/hypgrep/wiki_en.parquet',
|
|
52
|
+
})) {
|
|
53
|
+
console.log(row) // highest relevance first
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Create an index in JavaScript
|
|
58
|
+
|
|
59
|
+
```javascript
|
|
60
|
+
import { asyncBufferFromFile } from 'hyparquet'
|
|
61
|
+
import { fileWriter } from 'hyparquet-writer'
|
|
62
|
+
import { createIndex } from 'hypgrep'
|
|
63
|
+
|
|
64
|
+
// Generate dataset.index.parquet from dataset.parquet
|
|
65
|
+
const sourceFile = await asyncBufferFromFile('dataset.parquet')
|
|
66
|
+
const indexFile = fileWriter('dataset.index.parquet')
|
|
67
|
+
await createIndex({ sourceFile, indexFile })
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Local parquet files
|
|
71
|
+
|
|
72
|
+
To search against local parquet files, provide an `asyncBufferFactory` that loads the file from the local filesystem:
|
|
73
|
+
|
|
74
|
+
```js
|
|
75
|
+
import { asyncBufferFromFile } from 'hyparquet'
|
|
76
|
+
import { parquetFind } from 'hypgrep'
|
|
77
|
+
|
|
78
|
+
// Loads parquet file from local filesystem
|
|
79
|
+
function asyncBufferFactory({ url }) {
|
|
80
|
+
return asyncBufferFromFile(url)
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
for await (const row of parquetFind({
|
|
84
|
+
query: 'serverless',
|
|
85
|
+
url: 'dataset.parquet',
|
|
86
|
+
asyncBufferFactory,
|
|
87
|
+
})) {
|
|
88
|
+
console.log(row)
|
|
89
|
+
}
|
|
90
|
+
```
|
package/bin/cli.js
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { fileWriter } from 'hyparquet-writer'
import { realpathSync } from 'node:fs'
import { pathToFileURL } from 'node:url'
import { createIndex } from '../src/createIndex.js'
import { inspect } from './inspect.js'
import { loadParquet } from './load.js'
|
|
8
|
+
|
|
9
|
+
/**
 * Command line entry point.
 *
 * Reads the source parquet path from argv[2] and an optional index path from
 * argv[3]. Refuses to index a file that is already a HypGrep index, builds the
 * index, then prints statistics and the elapsed time. Exits the process with
 * status 1 on a missing argument, a HypGrep input, or a URL index path.
 */
async function main() {
  const [, , sourcePath, requestedIndexPath] = process.argv
  if (!sourcePath) {
    console.error('Usage: npx hypgrep dataset.parquet [dataset.index.parquet]')
    process.exit(1)
  }

  // Load source
  const source = await loadParquet(sourcePath)
  const { file: sourceFile, metadata: sourceMetadata } = source

  // An index file is recognised by its hypgrep.version key: report on it instead of indexing it
  const isHypGrepFile = sourceMetadata.key_value_metadata?.some(({ key }) => key === 'hypgrep.version')
  if (isHypGrepFile) {
    console.error('Input is already a HypGrep file')
    console.error(`HypGrep: ${sourcePath}`)
    await inspect({ indexPath: sourcePath })
    process.exit(1)
  }

  // Without an explicit index path, swap a trailing .parquet for .index.parquet
  const indexPath = requestedIndexPath ?? `${sourcePath.replace(/\.parquet$/i, '')}.index.parquet`
  const indexIsRemote = /^https?:\/\//.test(indexPath)
  if (indexIsRemote) {
    console.error('Must specify local index path to generate index for a remote source')
    process.exit(1)
  }

  console.log(`Source: ${sourcePath}`)
  console.log(`Source: ${sourceFile.byteLength.toLocaleString()} bytes`)
  console.log(`Source: ${sourceMetadata.num_rows.toLocaleString()} rows`)
  console.log(`Source: ${sourceMetadata.row_groups.length.toLocaleString()} rowgroups`)
  console.log(`HypGrep: ${indexPath}`)
  const startTime = performance.now()

  // Create index
  await createIndex({ sourceFile, sourceMetadata, indexFile: fileWriter(indexPath) })

  // Print statistics
  await inspect({ sourcePath, indexPath })

  console.log()
  const elapsedSeconds = ((performance.now() - startTime) / 1000).toFixed(1)
  console.log(`Created ${indexPath} in ${elapsedSeconds} s`)
}
|
|
54
|
+
|
|
55
|
+
/**
 * Tell whether this module is the script node was asked to run.
 *
 * process.argv[1] is the path as invoked, which may be a symlink (npm links
 * bin entries into node_modules/.bin), while import.meta.url refers to the
 * resolved file. The symlink is therefore resolved before comparing; when the
 * path cannot be resolved the unresolved path is compared instead.
 *
 * @returns {boolean}
 */
function isExecutedDirectly() {
  const scriptPath = process.argv[1]
  let resolvedPath = scriptPath
  try {
    resolvedPath = realpathSync(scriptPath)
  } catch {
    // Fall back to the path as given (e.g. node was started without the .js extension)
  }
  return import.meta.url === pathToFileURL(resolvedPath).href
}

// Only run the CLI when executed, not when this file is imported
if (isExecutedDirectly()) {
  main().catch(error => {
    console.error(error.message)
    process.exit(1)
  })
}
|
package/bin/inspect.js
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { parseKvMetadata } from '../src/queryIndex.js'
|
|
2
|
+
import { loadParquet } from './load.js'
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @import { FileMetaData } from 'hyparquet'
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Print statistics about an index file.
 *
 * Logs the index size, its row and rowgroup counts, the number of source
 * rowgroups and blocks, and an estimate of the bytes read by an average
 * single-term query together with the components of that estimate.
 *
 * @param {object} options
 * @param {string} options.indexPath
 * @param {string} [options.sourcePath] guessed from indexPath when omitted
 * @returns {Promise<void>}
 * @throws {Error} if the index or the source parquet cannot be loaded
 */
export async function inspect({ indexPath, sourcePath }) {
  // Load the index file
  const { file: indexFile, metadata: indexMetadata } = await loadParquet(indexPath)

  console.log(`HypGrep: ${indexFile.byteLength.toLocaleString()} bytes`)
  console.log(`HypGrep: ${indexMetadata.num_rows.toLocaleString()} rows`)
  console.log(`HypGrep: ${indexMetadata.row_groups.length.toLocaleString()} index rowgroups`)

  /**
   * Mean total_byte_size over the rowgroups of a file (0 when there are none).
   *
   * @param {FileMetaData} metadata
   * @returns {number}
   */
  const averageRowGroupSize = ({ row_groups }) => {
    let totalBytes = 0
    for (const rowGroup of row_groups) {
      totalBytes += Number(rowGroup.total_byte_size)
    }
    return totalBytes / (row_groups.length || 1)
  }

  // Guess source path if not provided
  sourcePath ??= indexPath.replace(/\.index\.parquet$/i, '.parquet')
  /** @type {FileMetaData} */
  let sourceMetadata
  try {
    sourceMetadata = (await loadParquet(sourcePath)).metadata
  } catch (error) {
    // Keep the underlying failure reachable through error.cause
    throw new Error(`Failed to load source parquet: ${sourcePath}`, { cause: error })
  }

  // Calculate source stats
  const { blockSize } = parseKvMetadata(indexMetadata.key_value_metadata ?? [])
  const numBlocks = Math.ceil(Number(sourceMetadata.num_rows) / blockSize)

  console.log(`HypGrep: ${sourceMetadata.row_groups.length.toLocaleString()} source rowgroups`)
  console.log(`HypGrep: ${numBlocks.toLocaleString()} blocks`)

  const avgIndexRowGroupSize = averageRowGroupSize(indexMetadata)
  const avgSourceRowGroupSize = averageRowGroupSize(sourceMetadata)

  // Average single-term query size: the total is the sum of the four components printed below it
  const queryAvg = Math.round(
    indexMetadata.metadata_length + avgIndexRowGroupSize +
    sourceMetadata.metadata_length + avgSourceRowGroupSize
  ).toLocaleString()
  const indexMetadataLength = Math.round(indexMetadata.metadata_length).toLocaleString()
  const indexAvgRowGroup = Math.round(avgIndexRowGroupSize).toLocaleString()
  const sourceMetadataLength = Math.round(sourceMetadata.metadata_length).toLocaleString()
  const sourceAvgRowGroup = Math.round(avgSourceRowGroupSize).toLocaleString()

  // Print statistics
  console.log(`Query: ~${queryAvg} bytes`)
  console.log(` + ${indexMetadataLength} bytes (index metadata)`)
  console.log(` + ${indexAvgRowGroup} bytes (index rowgroup)`)
  console.log(` + ${sourceMetadataLength} bytes (source metadata)`)
  console.log(` + ${sourceAvgRowGroup} bytes (source rowgroup)`)
}
|
package/bin/load.js
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import fs from 'node:fs/promises'
|
|
2
|
+
import { asyncBufferFromFile, asyncBufferFromUrl, parquetMetadataAsync } from 'hyparquet'
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* @import {AsyncBuffer, FileMetaData} from 'hyparquet'
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Open a parquet file from a local path or an http(s) URL and read its metadata.
 *
 * @param {string} path local filesystem path, or a URL starting with http:// or https://
 * @returns {Promise<{ file: AsyncBuffer, metadata: FileMetaData }>}
 * @throws {Error} if a local file is not accessible, the buffer cannot be
 *   opened, or the parquet metadata cannot be read
 */
export async function loadParquet(path) {
  const isRemote = /^https?:\/\//.test(path)

  if (!isRemote) {
    // Report a missing local file with a dedicated message before trying to open it
    try {
      await fs.access(path)
    } catch {
      throw new Error(`Parquet file not found: ${path}`)
    }
  }

  /** @type {AsyncBuffer | undefined} */
  let file
  try {
    file = isRemote
      ? await asyncBufferFromUrl({ url: path })
      : await asyncBufferFromFile(path)
  } catch (error) {
    // Log which kind of location failed, then let the original error propagate
    console.error(isRemote
      ? `Failed to load Parquet file from URL: ${path}`
      : `Failed to load Parquet file from path: ${path}`)
    throw error
  }

  try {
    const metadata = await parquetMetadataAsync(file)
    return { file, metadata }
  } catch (error) {
    throw new Error(`Failed to read Parquet metadata from file: ${path}`, { cause: error })
  }
}
|
package/package.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "hypgrep",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"author": "Hyperparam",
|
|
5
|
+
"homepage": "https://hyperparam.app",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"keywords": [
|
|
8
|
+
"parquet",
|
|
9
|
+
"index",
|
|
10
|
+
"search",
|
|
11
|
+
"full-text-search",
|
|
12
|
+
"hyparquet",
|
|
13
|
+
"serverless"
|
|
14
|
+
],
|
|
15
|
+
"repository": {
|
|
16
|
+
"type": "git",
|
|
17
|
+
"url": "git+https://github.com/hyparam/hypgrep.git"
|
|
18
|
+
},
|
|
19
|
+
"type": "module",
|
|
20
|
+
"sideEffects": false,
|
|
21
|
+
"bin": {
|
|
22
|
+
"hypgrep": "./bin/cli.js"
|
|
23
|
+
},
|
|
24
|
+
"types": "src/index.d.ts",
|
|
25
|
+
"main": "src/index.js",
|
|
26
|
+
"exports": {
|
|
27
|
+
".": {
|
|
28
|
+
"import": "./src/index.js",
|
|
29
|
+
"types": "./src/index.d.ts"
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
"files": [
|
|
33
|
+
"bin",
|
|
34
|
+
"src"
|
|
35
|
+
],
|
|
36
|
+
"scripts": {
|
|
37
|
+
"benchmark": "node benchmark.js",
|
|
38
|
+
"coverage": "vitest run --coverage --coverage.include=src",
|
|
39
|
+
"lint": "eslint",
|
|
40
|
+
"lint:fix": "eslint --fix",
|
|
41
|
+
"test": "vitest run"
|
|
42
|
+
},
|
|
43
|
+
"dependencies": {
|
|
44
|
+
"hyparquet": "1.25.8",
|
|
45
|
+
"hyparquet-compressors": "1.1.1",
|
|
46
|
+
"hyparquet-writer": "0.15.1"
|
|
47
|
+
},
|
|
48
|
+
"devDependencies": {
|
|
49
|
+
"@types/node": "25.9.1",
|
|
50
|
+
"@vitest/coverage-v8": "4.1.6",
|
|
51
|
+
"eslint": "9.39.4",
|
|
52
|
+
"eslint-plugin-jsdoc": "62.9.0",
|
|
53
|
+
"typescript": "6.0.3",
|
|
54
|
+
"vitest": "4.1.6"
|
|
55
|
+
}
|
|
56
|
+
}
|
package/src/createIndex.js
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
|
|
2
|
+
import { parquetWrite } from 'hyparquet-writer'
|
|
3
|
+
import { defaultBlockSize, defaultIndexRowGroupSize, hypGrepVersion } from './constants.js'
|
|
4
|
+
import { tokenize } from './tokenize.js'
|
|
5
|
+
import { getTextColumnsFromSchema } from './utils.js'
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* @import { BlockStats, CreateIndexOptions, IndexRow } from './types.js'
|
|
9
|
+
* @import { ColumnSource } from 'hyparquet-writer'
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Create a full-text search index parquet next to the given parquet file.
 *
 * The source is read in logical blocks of blockSize rows. For every block and
 * every term found in its text columns, one index row is recorded with the
 * number of rows containing the term (docCount) and the number of occurrences
 * (termFreq). Index rows are written sorted by term, and within a term in
 * blockId order.
 *
 * @param {CreateIndexOptions} options
 * @returns {Promise<void>}
 * @throws {Error} if blockSize is not a positive integer, or the source has no
 *   string columns to index
 */
export async function createIndex({
  sourceFile,
  sourceMetadata,
  indexFile,
  blockSize = defaultBlockSize,
  indexRowGroupSize = defaultIndexRowGroupSize,
}) {
  // A zero or negative block size would never advance the block loop below
  if (!Number.isInteger(blockSize) || blockSize <= 0) {
    throw new Error(`blockSize must be a positive integer, got ${blockSize}`)
  }

  const metadata = sourceMetadata ?? await parquetMetadataAsync(sourceFile)
  const numRows = Number(metadata.num_rows)
  const textColumns = getTextColumnsFromSchema(metadata)
  if (textColumns.length === 0) {
    throw new Error('No string columns found to index')
  }

  // Map from term -> array of entries (entries are in blockId order)
  /** @type {Map<string, IndexRow[]>} */
  const termIndex = new Map()

  let blockId = 0
  for (let rowStart = 0; rowStart < numRows; rowStart += blockSize) {
    const rowEnd = Math.min(rowStart + blockSize, numRows)

    const rows = await parquetReadObjects({
      file: sourceFile,
      metadata,
      rowStart,
      rowEnd,
      columns: textColumns,
    })

    const { termDocCount, termFreqMap } = collectBlockStats(rows, textColumns)

    // Build index entries for this block
    for (const [term, docCount] of termDocCount) {
      const termFreq = termFreqMap.get(term) ?? docCount
      const entry = { term, blockId, docCount, termFreq }
      const existing = termIndex.get(term)
      if (existing) {
        existing.push(entry)
      } else {
        termIndex.set(term, [entry])
      }
    }

    blockId += 1
  }

  // Sort by term
  const sortedTerms = Array.from(termIndex.keys()).sort()

  // Flatten into sorted indexRows
  /** @type {IndexRow[]} */
  const indexRows = []
  for (const term of sortedTerms) {
    const entries = termIndex.get(term)
    if (!entries) continue
    // Push one by one: a frequent term has one entry per block, and spreading
    // a very long array into push() can exceed the engine's argument limit
    for (const entry of entries) {
      indexRows.push(entry)
    }
  }

  const kvMetadata = [
    { key: 'hypgrep.version', value: String(hypGrepVersion) },
    { key: 'hypgrep.block_size', value: String(blockSize) },
    { key: 'hypgrep.text_columns', value: textColumns.join(',') },
    { key: 'hypgrep.source_rows', value: String(numRows) },
    // Can save network requests on the source file
    { key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) },
  ]

  const columnData = buildColumnData(indexRows)
  await parquetWrite({
    writer: indexFile,
    columnData,
    rowGroupSize: indexRowGroupSize,
    kvMetadata,
  })
}
|
|
94
|
+
|
|
95
|
+
/**
 * Collect term statistics for a single logical block of rows.
 *
 * termFreqMap counts every occurrence of a term in the block; termDocCount
 * counts each term at most once per row.
 *
 * @param {Record<string, any>[]} rows
 * @param {string[]} textColumns
 * @returns {BlockStats}
 */
function collectBlockStats(rows, textColumns) {
  const termDocCount = new Map()
  const termFreqMap = new Map()

  /**
   * Add one to the counter stored under key.
   *
   * @param {Map<string, number>} counts
   * @param {string} key
   */
  const increment = (counts, key) => {
    counts.set(key, (counts.get(key) || 0) + 1)
  }

  for (const row of rows) {
    if (!row) continue

    // Distinct terms of this row, so a row counts once per term in termDocCount
    const rowTerms = new Set()

    for (const columnName of textColumns) {
      const text = row[columnName]
      // Only non-empty strings are tokenized
      if (typeof text === 'string' && text.length > 0) {
        for (const term of tokenize(text)) {
          rowTerms.add(term)
          increment(termFreqMap, term)
        }
      }
    }

    for (const term of rowTerms) {
      increment(termDocCount, term)
    }
  }

  return { termDocCount, termFreqMap }
}
|
|
132
|
+
|
|
133
|
+
/**
 * Convert row-oriented index entries to column-oriented data for hyparquet-writer.
 *
 * @param {IndexRow[]} indexRows
 * @returns {ColumnSource[]} the term, blockId, docCount and termFreq columns, in that order
 */
function buildColumnData(indexRows) {
  /**
   * Collect one field of every index row, preserving row order.
   *
   * @param {'term' | 'blockId' | 'docCount' | 'termFreq'} field
   * @returns {any[]}
   */
  const pluck = field => indexRows.map(row => row[field])

  return [
    // Delta byte array encoding works well for sorted string columns
    { name: 'term', data: pluck('term'), type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' },
    // Delta binary packed works well for incrementing integers
    { name: 'blockId', data: pluck('blockId'), type: 'INT32', encoding: 'DELTA_BINARY_PACKED' },
    { name: 'docCount', data: pluck('docCount'), type: 'INT32' },
    { name: 'termFreq', data: pluck('termFreq'), type: 'INT32' },
  ]
}
|
package/src/index.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { CreateIndexOptions, ParquetSearchOptions, QueryIndexOptions, QueryResult } from './types.js'
|
|
2
|
+
|
|
3
|
+
export const hypGrepVersion: number
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Create a full-text search index parquet next to the given parquet file.
|
|
7
|
+
*/
|
|
8
|
+
export function createIndex(options: CreateIndexOptions): Promise<void>
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Uses hypgrep to find rows matching a query from a source parquet file.
|
|
12
|
+
*/
|
|
13
|
+
export function parquetFind(options: ParquetSearchOptions): AsyncGenerator<Record<string, any>, void, unknown>
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Uses hypgrep to query a source parquet file and return matching rows ranked by relevance.
|
|
17
|
+
*/
|
|
18
|
+
export function parquetSearch(options: ParquetSearchOptions): AsyncGenerator<Record<string, any>, void, unknown>
|
|
19
|
+
|
|
20
|
+
/**
 * Query a search index to find matching row groups from the source parquet.
 * Resolves to undefined when the query contains no terms, so callers must
 * handle a missing result.
 */
export function queryIndex(options: QueryIndexOptions): Promise<QueryResult | undefined>
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Tokenize text into normalized terms.
|
|
27
|
+
* Lowercases and splits on non-alphanumeric boundaries.
|
|
28
|
+
*/
|
|
29
|
+
export function tokenize(text: string): string[]
|
package/src/index.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { hypGrepVersion } from './constants.js'
|
|
2
|
+
export { createIndex } from './createIndex.js'
|
|
3
|
+
export { parquetFind } from './parquetFind.js'
|
|
4
|
+
export { parquetSearch } from './parquetSearch.js'
|
|
5
|
+
export { queryIndex } from './queryIndex.js'
|
|
6
|
+
export { tokenize } from './tokenize.js'
|
package/src/parquetFind.js
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
|
|
2
|
+
import { queryIndex } from './queryIndex.js'
|
|
3
|
+
import { tokenize } from './tokenize.js'
|
|
4
|
+
|
|
5
|
+
/**
 * Find rows matching a query, maintaining natural row order.
 *
 * The index is queried for candidate blocks; each candidate block is then read
 * from the source parquet in ascending blockId order and its rows are yielded
 * as soon as they match. Every yielded row carries its source row number in
 * `__index__`.
 *
 * @import {ParquetSearchOptions, TermResults} from '../src/types.js'
 * @param {ParquetSearchOptions} options
 * @returns {AsyncGenerator<Record<string, any>, void, unknown>}
 */
export async function* parquetFind({
  query,
  url,
  limit = Infinity,
  prefix = true,
  signal,
  asyncBufferFactory = asyncBufferFromUrl,
  sourceFile,
  sourceMetadata,
  indexFile,
  indexMetadata,
  ...hyparquetOptions
}) {
  if (!query || limit <= 0) return
  signal?.throwIfAborted()

  // Open the index next to the source (x.parquet -> x.index.parquet) unless one was supplied
  const index = indexFile ?? await asyncBufferFactory({
    url: `${url.replace(/\.parquet$/i, '')}.index.parquet`,
  })
  const found = await queryIndex({ query, indexFile: index, indexMetadata, prefix })
  if (!found) return
  const { blocks, textColumns, sourceByteLength } = found

  // Nothing matched in the index
  if (blocks.length === 0) return

  // Ascending blockId gives natural row order
  blocks.sort((left, right) => left.blockId - right.blockId)
  signal?.throwIfAborted()

  // Open the source only now; the byte length stored in the index is passed to the factory
  const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength })
  // Source metadata is read once, before the block loop, and only if not supplied
  const metadata = sourceMetadata ?? await parquetMetadataAsync(file)

  // Terms of the query, de-duplicated
  const queryTerms = new Set(tokenize(query))

  let yielded = 0
  for (const { rowStart, rowEnd, terms } of blocks) {
    signal?.throwIfAborted()
    const blockRows = await parquetReadObjects({
      ...hyparquetOptions,
      file,
      metadata,
      rowStart,
      rowEnd,
      useOffsetIndex: true,
    })

    // Rows are yielded in the order they were read (no sorting)
    for (const [offset, row] of blockRows.entries()) {
      if (!matchesRow(row, textColumns, queryTerms, terms, prefix)) continue
      yield { __index__: rowStart + offset, ...row }
      yielded += 1
      if (yielded >= limit) return
    }
  }
}
|
|
71
|
+
|
|
72
|
+
/**
 * Check if a row matches any of the query terms.
 *
 * A match requires both that the row contains the term (or, with prefix
 * matching, a token starting with it) and that the matched token has an entry
 * in termStats.
 *
 * @param {Record<string, any>} row
 * @param {string[]} textColumns
 * @param {Set<string>} queryTerms
 * @param {TermResults} termStats
 * @param {boolean} prefix
 * @returns {boolean}
 */
function matchesRow(row, textColumns, queryTerms, termStats, prefix) {
  // Distinct tokens over all string-valued text columns of the row
  const rowTokens = new Set()
  for (const col of textColumns) {
    const value = row[col]
    if (typeof value !== 'string') continue
    for (const token of tokenize(value)) {
      rowTokens.add(token)
    }
  }

  const wanted = [...queryTerms]

  if (!prefix) {
    // Exact matching
    return wanted.some(term => Boolean(rowTokens.has(term) && termStats[term]))
  }

  // Prefix matching: any row token that starts with a query term
  const tokens = [...rowTokens]
  return wanted.some(term =>
    tokens.some(token => Boolean(token.startsWith(term) && termStats[token]))
  )
}
|
package/src/parquetSearch.js
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
|
|
2
|
+
import { queryIndex } from './queryIndex.js'
|
|
3
|
+
import { tokenize } from './tokenize.js'
|
|
4
|
+
|
|
5
|
+
/**
 * Query a source parquet file through its hypgrep index and yield matching rows.
 *
 * Candidate blocks are visited from highest to lowest block score; within each
 * block the matching rows are yielded from highest to lowest row score. Every
 * yielded row carries its source row number in `__index__`.
 *
 * @import {ParquetSearchOptions, TermResults} from '../src/types.js'
 * @param {ParquetSearchOptions} options
 * @returns {AsyncGenerator<Record<string, any>, void, unknown>}
 */
export async function* parquetSearch({
  query,
  url,
  limit = Infinity,
  prefix = true,
  signal,
  asyncBufferFactory = asyncBufferFromUrl,
  sourceFile,
  sourceMetadata,
  indexFile,
  indexMetadata,
  ...hyparquetOptions
}) {
  if (!query || limit <= 0) return
  signal?.throwIfAborted()

  // Open the index next to the source (x.parquet -> x.index.parquet) unless one was supplied
  const index = indexFile ?? await asyncBufferFactory({
    url: `${url.replace(/\.parquet$/i, '')}.index.parquet`,
  })
  const found = await queryIndex({ query, indexFile: index, indexMetadata, prefix })
  if (!found) return
  const { blocks, textColumns, sourceByteLength } = found

  // Most relevant blocks first
  blocks.sort((left, right) => right.score - left.score)

  // Nothing matched in the index
  if (blocks.length === 0) return
  signal?.throwIfAborted()

  // Open the source only now; the byte length stored in the index is passed to the factory
  const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength })
  // Source metadata is read once, before the block loop, and only if not supplied
  const metadata = sourceMetadata ?? await parquetMetadataAsync(file)

  // Terms of the query, de-duplicated
  const queryTerms = new Set(tokenize(query))

  let yielded = 0
  for (const { rowStart, rowEnd, terms } of blocks) {
    signal?.throwIfAborted()
    const blockRows = await parquetReadObjects({
      ...hyparquetOptions,
      file,
      metadata,
      rowStart,
      rowEnd,
      useOffsetIndex: true,
    })

    // Score every row of the block, keep the matches, best score first
    /** @type {{index: number, row: Record<string, any>, score: number}[]} */
    const ranked = blockRows
      .map((row, offset) => ({
        index: rowStart + offset,
        row,
        score: scoreRow(row, textColumns, queryTerms, terms, prefix),
      }))
      .filter(({ score }) => score > 0)
      .sort((left, right) => right.score - left.score)

    for (const { index: rowIndex, row } of ranked) {
      yield { __index__: rowIndex, ...row }
      yielded += 1
      if (yielded >= limit) return
    }
  }
}
|
|
82
|
+
|
|
83
|
+
/**
 * Score a row based on which query terms it matches, weighted by IDF.
 *
 * Each match adds the idf recorded in termStats for the matched token, or 1
 * when no idf is recorded. With prefix matching every row token that starts
 * with a query term counts as a match.
 *
 * @param {Record<string, any>} row
 * @param {string[]} textColumns
 * @param {Set<string>} queryTerms
 * @param {TermResults} termStats
 * @param {boolean} prefix
 * @returns {number} score (0 if no match)
 */
function scoreRow(row, textColumns, queryTerms, termStats, prefix) {
  // Distinct tokens over all string-valued text columns of the row
  const rowTokens = new Set()
  for (const col of textColumns) {
    const value = row[col]
    if (typeof value !== 'string') continue
    for (const token of tokenize(value)) {
      rowTokens.add(token)
    }
  }

  /**
   * Weight contributed by one matched token.
   *
   * @param {string} token
   * @returns {number}
   */
  const weightOf = token => termStats[token]?.idf ?? 1

  let total = 0
  for (const queryTerm of queryTerms) {
    if (!prefix) {
      // Exact matching
      if (rowTokens.has(queryTerm)) total += weightOf(queryTerm)
      continue
    }
    // Prefix matching: use the matched token's stats from the index
    for (const token of rowTokens) {
      if (token.startsWith(queryTerm)) total += weightOf(token)
    }
  }

  return total
}
|
package/src/queryIndex.js
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import { parquetMetadataAsync, parquetQuery } from 'hyparquet'
|
|
2
|
+
import { defaultBlockSize, hypGrepVersion } from './constants.js'
|
|
3
|
+
import { tokenize } from './tokenize.js'
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @import { FileMetaData, KeyValue, ParquetQueryFilter } from 'hyparquet'
|
|
7
|
+
* @import { BlockResult, HypGrepMetadata, QueryIndexOptions, QueryResult, TermResults } from './types.js'
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Build the pushdown filter that selects index rows for the given terms.
 *
 * Exact mode tests the `term` column against the list with `$in`.
 * Prefix mode emits one half-open range per term, `t <= term < t'`, where `t'`
 * is `t` with its final UTF-16 code unit incremented by one, and combines the
 * ranges with `$or`.
 *
 * @param {string[]} terms
 * @param {boolean} prefix - whether to use prefix matching
 * @returns {ParquetQueryFilter}
 */
function termsFilter(terms, prefix) {
  if (!prefix) return { term: { $in: terms } }

  const ranges = []
  for (const t of terms) {
    const head = t.slice(0, -1)
    const bumped = String.fromCharCode(t.charCodeAt(t.length - 1) + 1)
    ranges.push({ term: { $gte: t, $lt: head + bumped } })
  }
  return { $or: ranges }
}
|
|
30
|
+
|
|
31
|
+
/**
 * Query a search index to find matching blocks of rows in the source parquet.
 * Returns undefined if query is empty so the search index is not used.
 *
 * Every index row whose term matches a query term (exactly, or as a prefix
 * when `prefix` is true) adds a BM25-style score to its block and records the
 * term's statistics for that block.
 *
 * @param {QueryIndexOptions} options
 * @returns {Promise<QueryResult | undefined>}
 */
export async function queryIndex({ query, indexFile, indexMetadata, prefix = true }) {
  // Tokenize the query using the same logic as indexing
  const queryTerms = tokenize(query)
  if (queryTerms.length === 0) return undefined

  // Read index kv metadata (only fetched when the caller did not supply it)
  indexMetadata ??= await parquetMetadataAsync(indexFile)
  const kvMetadata = indexMetadata.key_value_metadata || []
  const { blockSize, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)

  // Read index rows matching any of the query terms
  const indexRows = await parquetQuery({
    file: indexFile,
    metadata: indexMetadata,
    // use hyparquet pushdown filtering
    filter: termsFilter(queryTerms, prefix),
  })

  // Pre-compute corpusDocFreq by summing docCount per term across blocks
  /** @type {Map<string, number>} */
  const corpusDocFreq = new Map()
  for (const row of indexRows) {
    const prev = corpusDocFreq.get(row.term) || 0
    corpusDocFreq.set(row.term, prev + row.docCount)
  }

  // BM25 parameters (loop-invariant)
  const k1 = 1.2 // controls term frequency saturation
  const b = 0.75 // controls length normalization

  // Accumulated score per blockId
  /** @type {Map<number, number>} */
  const blockScores = new Map()
  // Accumulated term statistics per blockId
  /** @type {Map<number, TermResults>} */
  const blockTerms = new Map()

  // For each query term, find matching blocks and accumulate scores
  for (const queryTerm of queryTerms) {
    for (const indexRow of indexRows) {
      // Check if this index term matches (exact or prefix)
      const matches = prefix
        ? indexRow.term.startsWith(queryTerm)
        : indexRow.term === queryTerm
      if (!matches) continue

      // Use the actual index term's corpus doc freq for scoring
      const termCorpusDocFreq = corpusDocFreq.get(indexRow.term) || 0

      // IDF component: log((N - df + 0.5) / (df + 0.5) + 1)
      const idf = Math.log((sourceRows - termCorpusDocFreq + 0.5) / (termCorpusDocFreq + 0.5) + 1)

      // TF component with saturation and length normalization
      const tf = indexRow.termFreq
      const tfComponent = tf * (k1 + 1) / (tf + k1 * (1 - b + b * indexRow.docCount / blockSize))

      const currentScore = blockScores.get(indexRow.blockId) || 0
      blockScores.set(indexRow.blockId, currentScore + idf * tfComponent)

      // Collect term statistics
      let terms = blockTerms.get(indexRow.blockId)
      if (!terms) {
        terms = {}
        blockTerms.set(indexRow.blockId, terms)
      }
      terms[indexRow.term] = {
        docs: indexRow.docCount,
        frequency: indexRow.termFreq,
        idf,
      }
    }
  }

  // Convert block scores to BlockResults
  /** @type {BlockResult[]} */
  const blocks = []
  for (const [blockId, score] of blockScores.entries()) {
    const rowStart = blockId * blockSize
    // Clamp to the number of rows in the SOURCE file: the last block may be
    // partial. The index file's own num_rows counts index entries, not source
    // rows, so it must not be used as the bound here.
    const rowEnd = Math.min((blockId + 1) * blockSize, sourceRows)
    const terms = blockTerms.get(blockId) || {}
    blocks.push({ blockId, rowStart, rowEnd, score, terms })
  }

  return { blocks, textColumns, sourceByteLength }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Extract the hypgrep settings stored in an index file's key-value metadata.
 *
 * Recognized keys: `hypgrep.block_size`, `hypgrep.version`,
 * `hypgrep.text_columns` (comma-separated), `hypgrep.source_rows` and
 * `hypgrep.source_bytelength`. Other keys are ignored. The block size falls
 * back to `defaultBlockSize` and the text columns to an empty list.
 *
 * @param {KeyValue[]} kvMetadata
 * @returns {HypGrepMetadata}
 * @throws {Error} if a version entry differs from `hypGrepVersion`, or if the
 *   source row count or source byte length entry is absent
 */
export function parseKvMetadata(kvMetadata) {
  let blockSize = defaultBlockSize
  /** @type {string[]} */
  let textColumns = []
  /** @type {number | undefined} */
  let sourceByteLength
  /** @type {number | undefined} */
  let sourceRows

  for (const entry of kvMetadata) {
    const { key, value } = entry
    switch (key) {
      case 'hypgrep.block_size':
        blockSize = Number(value)
        break
      case 'hypgrep.version':
        // Reject indexes written in a format this code does not understand
        if (Number(value) !== hypGrepVersion) {
          throw new Error(`Unsupported hypgrep version ${value}`)
        }
        break
      case 'hypgrep.text_columns':
        // An empty value leaves the column list empty
        if (value) textColumns = value.split(',')
        break
      case 'hypgrep.source_rows':
        sourceRows = Number(value)
        break
      case 'hypgrep.source_bytelength':
        sourceByteLength = Number(value)
        break
      default:
        break
    }
  }

  if (sourceRows === undefined) {
    throw new Error('Missing hypgrep.source_rows in index metadata')
  }
  if (sourceByteLength === undefined) {
    throw new Error('Missing hypgrep.source_bytelength in index metadata')
  }

  return { blockSize, textColumns, sourceByteLength, sourceRows }
}
|
package/src/stemmer.js
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
|
|
2
|
+
// Letters treated as vowels when checking that a stem is word-like ('y' included)
const vowels = 'aeiouy'

// Suffix-stripping rules, tried in this order (longest suffix first).
//   suffix     - ending to remove
//   minStem    - minimum length of what remains after removal
//   needsVowel - when set, the remaining stem must contain a vowel
//   plural     - when set, the rule is skipped for words ending in ss/us/is
const suffixRules = [
  {
    suffix: 'ing',
    minStem: 4,
    needsVowel: true,
  },
  {
    suffix: 'ed',
    minStem: 3,
    needsVowel: true,
  },
  {
    suffix: 'ly',
    minStem: 3,
  },
  // handle plural-ish endings carefully
  {
    suffix: 'es',
    minStem: 3,
    needsVowel: true,
    plural: true,
  },
  {
    suffix: 's',
    minStem: 3,
    needsVowel: true,
    plural: true,
  },
  // bad with i/y: er, ers, est, ies
  // other options: ment, ingly, edly, ness
]
|
|
15
|
+
|
|
16
|
+
/**
 * Strip one common English suffix from a word, if any rule applies.
 *
 * Only ever removes characters from the end, so the result is always a prefix
 * of the input. Words shorter than four characters, or containing anything
 * other than the letters a-z, are returned unchanged. The first rule in
 * `suffixRules` that is applicable wins; if none is, the word is returned as is.
 *
 * @param {string} term - lowercase word to stem
 * @returns {string} stemmed word
 */
export function stemmer(term) {
  // too short to bother, or not a simple lowercase word
  if (term.length < 4 || !isLowerAlpha(term)) return term

  // endings that look plural but are not: class/boss, virus/status, this/analysis
  const falsePlural = term.endsWith('ss') || term.endsWith('us') || term.endsWith('is')

  for (const { suffix, minStem, needsVowel, plural } of suffixRules) {
    if (!term.endsWith(suffix)) continue

    const stem = term.slice(0, -suffix.length)
    const tooShort = stem.length < minStem
    if (tooShort) continue
    if (needsVowel && !hasVowel(stem)) continue
    if (plural && falsePlural) continue

    return stem
  }

  return term
}
|
|
56
|
+
|
|
57
|
+
/**
 * Report whether the string contains at least one character listed in `vowels`.
 *
 * @param {string} s
 * @returns {boolean} true if a vowel is present, false otherwise (including for '')
 */
function hasVowel(s) {
  for (const ch of s) {
    if (vowels.includes(ch)) return true
  }
  return false
}
|
|
67
|
+
|
|
68
|
+
/**
 * Report whether every character of the string is a lowercase ASCII letter
 * (a-z). The empty string trivially qualifies.
 *
 * @param {string} s
 * @returns {boolean}
 */
function isLowerAlpha(s) {
  return /^[a-z]*$/.test(s)
}
|
package/src/tokenize.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { stemmer } from './stemmer.js'
|
|
2
|
+
|
|
3
|
+
/**
 * Common English words that are dropped during tokenization.
 * They occur very frequently and carry little search value, so leaving them
 * out keeps the index smaller.
 */
const STOP_WORDS = new Set([
  'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that',
  'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as',
  'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from',
  'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will',
  'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so',
  'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go',
  'me', 'when', 'make', 'can', 'like', 'no', 'just', 'him',
  'know', 'take', 'into', 'your', 'some', 'could', 'them', 'than',
  'then', 'now', 'only', 'its', 'also', 'other', 'how', 'our',
  'may', 'these', 'was', 'been', 'has', 'had', 'are', 'is',
  'am', 'were', 'does', 'did', 'being',
])
|
|
18
|
+
|
|
19
|
+
/**
 * Break camelCase / PascalCase identifiers apart with spaces.
 * A space is inserted wherever an uppercase ASCII letter directly follows a
 * lowercase ASCII letter or a digit: "parseUserInput" becomes
 * "parse User Input", while a run of capitals such as "XMLParser" is left intact.
 *
 * @param {string} text
 * @returns {string}
 */
function splitCamelCase(text) {
  return text.replace(/([a-z0-9])([A-Z])/g, (_match, before, upper) => `${before} ${upper}`)
}
|
|
30
|
+
|
|
31
|
+
/**
 * Strip diacritics/accents from text.
 * Converts "café" to "cafe", "résumé" to "resume", etc.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeUnicode(text) {
  // NFD splits each accented character into its base letter followed by
  // separate combining marks
  const decomposed = text.normalize('NFD')
  // Drop the combining marks in the range U+0300 to U+036F, keeping the base letters
  return decomposed.replace(/[\u0300-\u036f]/g, '')
}
|
|
43
|
+
|
|
44
|
+
/**
 * Turn free text into a list of normalized search terms.
 *
 * Pipeline: split camelCase words, strip accents, lowercase, cut on every run
 * of characters other than a-z and 0-9, discard tokens shorter than two
 * characters and stop words, then stem what is left. Duplicates are kept and
 * the original order is preserved.
 *
 * @param {string} text
 * @returns {string[]}
 */
export function tokenize(text) {
  // camelCase splitting must see the original casing, so it runs first;
  // accents are removed before lowercasing
  const prepared = normalizeUnicode(splitCamelCase(text)).toLowerCase()

  return prepared
    // Split on non-alphanumeric boundaries
    .split(/[^a-z0-9]+/g)
    // Drop empty / single-character pieces and stop words
    .filter(piece => piece.length >= 2 && !STOP_WORDS.has(piece))
    // Reduce each remaining word to its stem
    .map(piece => stemmer(piece))
}
|
package/src/types.d.ts
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import type { AsyncBuffer, asyncBufferFromUrl, Compressors, FileMetaData, ParquetParsers, ParquetQueryFilter } from 'hyparquet'
|
|
2
|
+
import type { Writer } from 'hyparquet-writer'
|
|
3
|
+
|
|
4
|
+
export interface CreateIndexOptions {
  /** File reader for the source parquet file. */
  sourceFile: AsyncBuffer
  /** Optional source parquet metadata. */
  sourceMetadata?: FileMetaData
  /** File writer for the output index parquet file. */
  indexFile: Writer
  /** Number of rows per logical block. */
  blockSize?: number
  /** Row group size in the index file. */
  indexRowGroupSize?: number
}
|
|
11
|
+
|
|
12
|
+
export interface QueryIndexOptions {
  /** The search query string. */
  query: string
  /** File reader for the index parquet file. */
  indexFile: AsyncBuffer
  /** Optional index parquet metadata. */
  indexMetadata?: FileMetaData
  /** Enable prefix matching (default: true). */
  prefix?: boolean
}
|
|
18
|
+
|
|
19
|
+
export interface ParquetSearchOptions {
  /** The search query string. */
  query: string
  /** URL or file path to the source parquet file. */
  url: string
  /** Maximum number of matching rows to return. */
  limit?: number
  /** Enable prefix matching (default: true). */
  prefix?: boolean

  // ---- fetch options ----

  /** Optional AbortSignal to cancel the search operation. */
  signal?: AbortSignal
  /** Optional factory to create AsyncBuffers for source and index files. */
  asyncBufferFactory?: typeof asyncBufferFromUrl
  /** File reader for the source parquet file. */
  sourceFile?: AsyncBuffer
  /** Optional source parquet metadata. */
  sourceMetadata?: FileMetaData
  /** File reader for the index parquet file. */
  indexFile?: AsyncBuffer
  /** Optional index parquet metadata. */
  indexMetadata?: FileMetaData

  // ---- misc options passed through to hyparquet ----

  columns?: string[]
  filter?: ParquetQueryFilter
  compressors?: Compressors
  utf8?: boolean
  parsers?: ParquetParsers
}
|
|
40
|
+
|
|
41
|
+
/**
 * One entry of the search index: the statistics of a single term within a
 * single logical block.
 */
export interface IndexRow {
  /** Normalized search term. */
  term: string
  /** Logical block ID this term appears in. */
  blockId: number
  /** Number of documents in the block containing this term. */
  docCount: number
  /** Total frequency of the term in the block. */
  termFreq: number
}
|
|
50
|
+
|
|
51
|
+
export interface QueryResult {
  /** List of matching blocks. */
  blocks: BlockResult[]
  /** List of indexed text columns in the source parquet. */
  textColumns: string[]
  /** Byte length of the source parquet file. */
  sourceByteLength: number
}
|
|
56
|
+
|
|
57
|
+
/**
 * A block of source-parquet rows that matched the query.
 */
export interface BlockResult {
  blockId: number
  /** Starting row index (inclusive) in the source parquet. */
  rowStart: number
  /** Ending row index (exclusive) in the source parquet. */
  rowEnd: number
  /** Relevance score based on term frequency. */
  score: number
  /** Per-term statistics. */
  terms: TermResults
}
|
|
67
|
+
|
|
68
|
+
/** Per-term statistics, keyed by term. */
export type TermResults = Record<string, TermResult>

interface TermResult {
  /** Number of documents in the block containing this term. */
  docs: number
  /** Total occurrences of the term in the block. */
  frequency: number
  /** Inverse document frequency for this term. */
  idf: number
}
|
|
74
|
+
|
|
75
|
+
/**
 * Source-file facts and index settings, as parsed from the index file's
 * key-value metadata.
 */
export interface HypGrepMetadata {
  /** Number of rows per logical block. */
  blockSize: number
  /** List of indexed text columns. */
  textColumns: string[]
  /** Number of rows in the source parquet file. */
  sourceRows: number
  /** Byte length of the source parquet file. */
  sourceByteLength: number
}
|
|
85
|
+
|
|
86
|
+
/**
 * Term statistics gathered for one logical block while the index is built.
 */
export interface BlockStats {
  /** Number of documents containing each term. */
  termDocCount: Map<string, number>
  /** Total frequency of each term in the block. */
  termFreqMap: Map<string, number>
}
|
package/src/utils.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { parquetSchema } from 'hyparquet'
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @import {FileMetaData, SchemaTree} from 'hyparquet'
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * List the names of the string columns found in a parquet schema.
 *
 * The schema tree is walked depth-first, parents before children. A node is
 * reported when it has no children and is marked as text, either through
 * converted type `UTF8` or logical type `STRING`. Only the node's own name is
 * returned, not the path leading to it.
 *
 * @param {FileMetaData} metadata
 * @returns {string[]}
 */
export function getTextColumnsFromSchema(metadata) {
  /**
   * Collect text leaf names from a subtree, in pre-order.
   *
   * @param {SchemaTree} node
   * @returns {string[]}
   */
  const collect = ({ element, children }) => {
    const kids = children || []
    const isText =
      element.converted_type === 'UTF8' ||
      element.logical_type?.type === 'STRING'

    // Only leaves count as columns; string-typed groups are skipped
    const own = isText && kids.length === 0 ? [element.name] : []
    return own.concat(kids.flatMap(child => collect(child)))
  }

  return collect(parquetSchema(metadata))
}
|