hypgrep 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ The MIT License (MIT)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # HypGrep
2
+
3
+ [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
4
+ ![coverage](https://img.shields.io/badge/Coverage-95-darkred)
5
+
6
+ Build a compact full-text search index for a Parquet file using [`hyparquet`](https://github.com/hyparam/hyparquet) and [`hyparquet-writer`](https://github.com/hyparam/hyparquet-writer).
7
+
8
+ ## Why?
9
+
10
+ Enable efficient full-text search on large Parquet datasets from any client without a server. Store your Parquet dataset on S3, generate a compact index file, and query it directly from a browser or other clients using HTTP range requests. The index tells you exactly which row blocks to fetch, so you only download the data you need.
11
+
12
+ Perfect for serverless architectures where you want to offer search capabilities without managing infrastructure.
13
+
14
+ ## CLI usage
15
+
16
+ ```bash
17
+ npx hypgrep dataset.parquet [dataset.index.parquet]
18
+ ```
19
+
20
+ To install as a system-wide CLI tool:
21
+
22
+ ```bash
23
+ npm install -g hypgrep
24
+ hypgrep dataset.parquet [dataset.index.parquet]
25
+ ```
26
+
27
+ ## Find rows in a parquet file in JavaScript
28
+
29
+ Use `parquetFind` to find rows matching a query while preserving natural row order (like Ctrl+F):
30
+
31
+ ```javascript
32
+ import { parquetFind } from 'hypgrep'
33
+
34
+ for await (const row of parquetFind({
35
+ query: 'serverless',
36
+ url: 'https://s3.hyperparam.app/hypgrep/wiki_en.parquet',
37
+ })) {
38
+ console.log(row) // { title: '...', text: '...' }
39
+ }
40
+ ```
41
+
42
+ ## Ranked search
43
+
44
+ Use `parquetSearch` to rank results by BM25 relevance score (like a search engine):
45
+
46
+ ```javascript
47
+ import { parquetSearch } from 'hypgrep'
48
+
49
+ for await (const row of parquetSearch({
50
+ query: 'serverless',
51
+ url: 'https://s3.hyperparam.app/hypgrep/wiki_en.parquet',
52
+ })) {
53
+ console.log(row) // highest relevance first
54
+ }
55
+ ```
56
+
57
+ ## Create an index in JavaScript
58
+
59
+ ```javascript
60
+ import { asyncBufferFromFile } from 'hyparquet'
61
+ import { fileWriter } from 'hyparquet-writer'
62
+ import { createIndex } from 'hypgrep'
63
+
64
+ // Generate dataset.index.parquet from dataset.parquet
65
+ const sourceFile = await asyncBufferFromFile('dataset.parquet')
66
+ const indexFile = fileWriter('dataset.index.parquet')
67
+ await createIndex({ sourceFile, indexFile })
68
+ ```
69
+
70
+ ## Local parquet files
71
+
72
+ To search against local parquet files, provide an `asyncBufferFactory` that loads the file from the local filesystem:
73
+
74
+ ```js
75
+ import { asyncBufferFromFile } from 'hyparquet'
76
+ import { parquetFind } from 'hypgrep'
77
+
78
+ // Loads parquet file from local filesystem
79
+ function asyncBufferFactory({ url }) {
80
+ return asyncBufferFromFile(url)
81
+ }
82
+
83
+ for await (const row of parquetFind({
84
+ query: 'serverless',
85
+ url: 'dataset.parquet',
86
+ asyncBufferFactory,
87
+ })) {
88
+ console.log(row)
89
+ }
90
+ ```
package/bin/cli.js ADDED
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { fileWriter } from 'hyparquet-writer'
4
+ import { pathToFileURL } from 'node:url'
5
+ import { createIndex } from '../src/createIndex.js'
6
+ import { inspect } from './inspect.js'
7
+ import { loadParquet } from './load.js'
8
+
9
/**
 * Command line entry point: builds a HypGrep index for the parquet file named
 * by the first CLI argument and prints statistics about the result.
 *
 * The optional second argument is the index output path; when omitted it is
 * derived from the source path by swapping a trailing `.parquet` for
 * `.index.parquet`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [, , sourcePath, requestedIndexPath] = process.argv
  if (!sourcePath) {
    console.error('Usage: npx hypgrep dataset.parquet [dataset.index.parquet]')
    process.exit(1)
  }

  // Open the source and read its metadata
  const { file: sourceFile, metadata: sourceMetadata } = await loadParquet(sourcePath)

  // Refuse to index something that is itself an index; show its stats instead
  const alreadyIndexed = sourceMetadata.key_value_metadata?.some(({ key }) => key === 'hypgrep.version')
  if (alreadyIndexed) {
    console.error('Input is already a HypGrep file')
    console.error(`HypGrep: ${sourcePath}`)
    await inspect({ indexPath: sourcePath })
    process.exit(1)
  }

  // The index is written with a local file writer, so it cannot be a URL
  const indexPath = requestedIndexPath ?? `${sourcePath.replace(/\.parquet$/i, '')}.index.parquet`
  if (/^https?:\/\//.test(indexPath)) {
    console.error('Must specify local index path to generate index for a remote source')
    process.exit(1)
  }

  console.log(`Source: ${sourcePath}`)
  console.log(`Source: ${sourceFile.byteLength.toLocaleString()} bytes`)
  console.log(`Source: ${sourceMetadata.num_rows.toLocaleString()} rows`)
  console.log(`Source: ${sourceMetadata.row_groups.length.toLocaleString()} rowgroups`)
  console.log(`HypGrep: ${indexPath}`)
  const startedAt = performance.now()

  // Build the index file
  await createIndex({ sourceFile, sourceMetadata, indexFile: fileWriter(indexPath) })

  // Report index statistics
  await inspect({ sourcePath, indexPath })

  const elapsedSeconds = ((performance.now() - startedAt) / 1000).toFixed(1)
  console.log()
  console.log(`Created ${indexPath} in ${elapsedSeconds} s`)
}
54
+
55
// Run main() only when this module is the script being executed (not when it
// is imported). npm installs `bin` entries as symlinks on POSIX systems, so
// process.argv[1] can be the symlink path while import.meta.url is the resolved
// real path. Resolve the symlink before comparing; otherwise the check is false
// and the CLI silently does nothing.
const { realpathSync } = await import('node:fs')
const entryPath = process.argv[1] ? realpathSync(process.argv[1]) : undefined
if (entryPath !== undefined && import.meta.url === pathToFileURL(entryPath).href) {
  main().catch(error => {
    console.error(error.message)
    process.exit(1)
  })
}
package/bin/inspect.js ADDED
@@ -0,0 +1,67 @@
1
+ import { parseKvMetadata } from '../src/queryIndex.js'
2
+ import { loadParquet } from './load.js'
3
+
4
+ /**
5
+ * @import { FileMetaData } from 'hyparquet'
6
+ */
7
+
8
/**
 * Print statistics about an index file, plus an estimate of the bytes fetched
 * by an average single-term query (index metadata + one index row group + one
 * source row group).
 *
 * @param {object} options
 * @param {string} options.indexPath - path or URL of the index parquet file
 * @param {string} [options.sourcePath] - path or URL of the source parquet file;
 *   when omitted it is guessed by replacing a trailing `.index.parquet` of
 *   indexPath with `.parquet`
 * @returns {Promise<void>}
 * @throws {Error} if the index or the source parquet file cannot be loaded
 */
export async function inspect({ indexPath, sourcePath }) {
  // Load the index file
  const { file: indexFile, metadata: indexMetadata } = await loadParquet(indexPath)

  console.log(`HypGrep: ${indexFile.byteLength.toLocaleString()} bytes`)
  console.log(`HypGrep: ${indexMetadata.num_rows.toLocaleString()} rows`)
  console.log(`HypGrep: ${indexMetadata.row_groups.length.toLocaleString()} index rowgroups`)

  const avgIndexRowGroupSize = averageRowGroupSize(indexMetadata.row_groups)

  // Guess source path if not provided
  sourcePath ??= indexPath.replace(/\.index\.parquet$/i, '.parquet')
  let sourceMetadata
  try {
    const source = await loadParquet(sourcePath)
    sourceMetadata = source.metadata
  } catch (error) {
    // Keep the underlying failure reachable through `cause` instead of discarding it
    throw new Error(`Failed to load source parquet: ${sourcePath}`, { cause: error })
  }

  // Calculate source stats
  const { blockSize } = parseKvMetadata(indexMetadata.key_value_metadata ?? [])
  const numBlocks = Math.ceil(Number(sourceMetadata.num_rows) / blockSize)

  console.log(`HypGrep: ${sourceMetadata.row_groups.length.toLocaleString()} source rowgroups`)
  console.log(`HypGrep: ${numBlocks.toLocaleString()} blocks`)

  const avgSourceRowGroupSize = averageRowGroupSize(sourceMetadata.row_groups)

  // Average single-term query size
  const indexMetadataLength = Math.round(indexMetadata.metadata_length).toLocaleString()
  const indexAvgRowGroup = Math.round(avgIndexRowGroupSize).toLocaleString()
  const sourceMetadataLength = Math.round(sourceMetadata.metadata_length).toLocaleString()
  const sourceAvgRowGroup = Math.round(avgSourceRowGroupSize).toLocaleString()
  const queryAvg = Math.round(
    indexMetadata.metadata_length + avgIndexRowGroupSize + avgSourceRowGroupSize
  ).toLocaleString()

  // Print statistics
  console.log(`Query: ~${queryAvg} bytes`)
  console.log(` + ${indexMetadataLength} bytes (index metadata)`)
  console.log(` + ${indexAvgRowGroup} bytes (index rowgroup)`)
  console.log(` + ${sourceMetadataLength} bytes (source metadata)`)
  console.log(` + ${sourceAvgRowGroup} bytes (source rowgroup)`)
}

/**
 * Mean `total_byte_size` of the given row groups (0 when there are none).
 *
 * @param {{ total_byte_size: bigint | number }[]} rowGroups
 * @returns {number}
 */
function averageRowGroupSize(rowGroups) {
  let totalBytes = 0
  for (const rowGroup of rowGroups) {
    totalBytes += Number(rowGroup.total_byte_size)
  }
  // `|| 1` avoids dividing by zero for a file without row groups
  return totalBytes / (rowGroups.length || 1)
}
package/bin/load.js ADDED
@@ -0,0 +1,43 @@
1
+ import fs from 'node:fs/promises'
2
+ import { asyncBufferFromFile, asyncBufferFromUrl, parquetMetadataAsync } from 'hyparquet'
3
+
4
+ /**
5
+ * @import {AsyncBuffer, FileMetaData} from 'hyparquet'
6
+ */
7
+
8
/**
 * Open a parquet file from a local path or an http(s) URL and read its metadata.
 *
 * @param {string} path - local filesystem path, or a URL starting with http:// or https://
 * @returns {Promise<{ file: AsyncBuffer, metadata: FileMetaData }>}
 */
export async function loadParquet(path) {
  const file = /^https?:\/\//.test(path)
    ? await openRemoteParquet(path)
    : await openLocalParquet(path)

  try {
    const metadata = await parquetMetadataAsync(file)
    return { file, metadata }
  } catch (error) {
    throw new Error(`Failed to read Parquet metadata from file: ${path}`, { cause: error })
  }
}

/**
 * Open a remote parquet file; logs the failing URL before rethrowing.
 *
 * @param {string} url
 * @returns {Promise<AsyncBuffer>}
 */
async function openRemoteParquet(url) {
  try {
    return await asyncBufferFromUrl({ url })
  } catch (error) {
    console.error(`Failed to load Parquet file from URL: ${url}`)
    throw error
  }
}

/**
 * Open a local parquet file; a missing file is reported with a dedicated error,
 * any other failure is logged with the path and rethrown.
 *
 * @param {string} path
 * @returns {Promise<AsyncBuffer>}
 */
async function openLocalParquet(path) {
  try {
    await fs.access(path)
  } catch {
    throw new Error(`Parquet file not found: ${path}`)
  }
  try {
    return await asyncBufferFromFile(path)
  } catch (error) {
    console.error(`Failed to load Parquet file from path: ${path}`)
    throw error
  }
}
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "hypgrep",
3
+ "version": "0.1.1",
4
+ "author": "Hyperparam",
5
+ "homepage": "https://hyperparam.app",
6
+ "license": "MIT",
7
+ "keywords": [
8
+ "parquet",
9
+ "index",
10
+ "search",
11
+ "full-text-search",
12
+ "hyparquet",
13
+ "serverless"
14
+ ],
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "git+https://github.com/hyparam/hypgrep.git"
18
+ },
19
+ "type": "module",
20
+ "sideEffects": false,
21
+ "bin": {
22
+ "hypgrep": "./bin/cli.js"
23
+ },
24
+ "types": "src/index.d.ts",
25
+ "main": "src/index.js",
26
+ "exports": {
27
+ ".": {
28
+ "import": "./src/index.js",
29
+ "types": "./src/index.d.ts"
30
+ }
31
+ },
32
+ "files": [
33
+ "bin",
34
+ "src"
35
+ ],
36
+ "scripts": {
37
+ "benchmark": "node benchmark.js",
38
+ "coverage": "vitest run --coverage --coverage.include=src",
39
+ "lint": "eslint",
40
+ "lint:fix": "eslint --fix",
41
+ "test": "vitest run"
42
+ },
43
+ "dependencies": {
44
+ "hyparquet": "1.25.8",
45
+ "hyparquet-compressors": "1.1.1",
46
+ "hyparquet-writer": "0.15.1"
47
+ },
48
+ "devDependencies": {
49
+ "@types/node": "25.9.1",
50
+ "@vitest/coverage-v8": "4.1.6",
51
+ "eslint": "9.39.4",
52
+ "eslint-plugin-jsdoc": "62.9.0",
53
+ "typescript": "6.0.3",
54
+ "vitest": "4.1.6"
55
+ }
56
+ }
@@ -0,0 +1,8 @@
1
+ // Version of the parquet index format, stored in the index under the hypgrep.version metadata key
2
+ export const hypGrepVersion = 0
3
+
4
+ // Default number of source rows per virtual block (default for createIndex's blockSize)
5
+ export const defaultBlockSize = 500
6
+
7
+ // Default row group size of the index file (default for createIndex's indexRowGroupSize)
8
+ export const defaultIndexRowGroupSize = 40000
@@ -0,0 +1,162 @@
1
+ import { parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
2
+ import { parquetWrite } from 'hyparquet-writer'
3
+ import { defaultBlockSize, defaultIndexRowGroupSize, hypGrepVersion } from './constants.js'
4
+ import { tokenize } from './tokenize.js'
5
+ import { getTextColumnsFromSchema } from './utils.js'
6
+
7
+ /**
8
+ * @import { BlockStats, CreateIndexOptions, IndexRow } from './types.js'
9
+ * @import { ColumnSource } from 'hyparquet-writer'
10
+ */
11
+
12
/**
 * Create a full-text search index parquet for the given source parquet file.
 *
 * The source is read in virtual blocks of `blockSize` rows. For every block,
 * one index row is written per distinct term: (term, blockId, docCount,
 * termFreq). Index rows are sorted by term, and within a term by blockId.
 *
 * @param {CreateIndexOptions} options
 * @returns {Promise<void>}
 * @throws {Error} if blockSize is not a positive integer, or if the source has
 *   no string columns to index
 */
export async function createIndex({
  sourceFile,
  sourceMetadata,
  indexFile,
  blockSize = defaultBlockSize,
  indexRowGroupSize = defaultIndexRowGroupSize,
}) {
  // A zero or negative block size would never advance the row cursor below
  // (endless loop), and a fractional one would produce fractional row ranges.
  if (!Number.isInteger(blockSize) || blockSize <= 0) {
    throw new Error(`blockSize must be a positive integer, got ${blockSize}`)
  }

  const metadata = sourceMetadata ?? await parquetMetadataAsync(sourceFile)
  const numRows = Number(metadata.num_rows)
  const textColumns = getTextColumnsFromSchema(metadata)
  if (textColumns.length === 0) {
    throw new Error('No string columns found to index')
  }

  // Map from term -> array of entries (entries are in blockId order)
  /** @type {Map<string, IndexRow[]>} */
  const termIndex = new Map()

  let blockId = 0
  for (let rowStart = 0; rowStart < numRows; rowStart += blockSize) {
    const rowEnd = Math.min(rowStart + blockSize, numRows)

    const rows = await parquetReadObjects({
      file: sourceFile,
      metadata,
      rowStart,
      rowEnd,
      columns: textColumns,
    })

    const { termDocCount, termFreqMap } = collectBlockStats(rows, textColumns)

    // Build index entries for this block
    for (const [term, docCount] of termDocCount.entries()) {
      const termFreq = termFreqMap.get(term) || docCount
      const entry = { term, blockId, docCount, termFreq }
      const existing = termIndex.get(term)
      if (existing) {
        existing.push(entry)
      } else {
        termIndex.set(term, [entry])
      }
    }

    blockId += 1
  }

  // Sort by term
  const sortedTerms = Array.from(termIndex.keys()).sort()

  // Flatten into sorted indexRows
  /** @type {IndexRow[]} */
  const indexRows = []
  for (const term of sortedTerms) {
    const entries = termIndex.get(term)
    if (!entries) continue
    // Append one at a time: push(...entries) passes every entry as a call
    // argument and can exceed the engine's argument limit for a term that
    // occurs in a very large number of blocks.
    for (const entry of entries) {
      indexRows.push(entry)
    }
  }

  const kvMetadata = [
    { key: 'hypgrep.version', value: String(hypGrepVersion) },
    { key: 'hypgrep.block_size', value: String(blockSize) },
    { key: 'hypgrep.text_columns', value: textColumns.join(',') },
    { key: 'hypgrep.source_rows', value: String(numRows) },
    // Can save network requests on the source file
    { key: 'hypgrep.source_bytelength', value: String(sourceFile.byteLength) },
  ]

  const columnData = buildColumnData(indexRows)
  await parquetWrite({
    writer: indexFile,
    columnData,
    rowGroupSize: indexRowGroupSize,
    kvMetadata,
  })
}
94
+
95
/**
 * Gather per-term counts for one logical block of rows.
 *
 * `termFreqMap` counts every occurrence of a term across the block's text
 * columns; `termDocCount` counts the rows in which the term occurs at least once.
 *
 * @param {Record<string, any>[]} rows
 * @param {string[]} textColumns
 * @returns {BlockStats}
 */
function collectBlockStats(rows, textColumns) {
  const termDocCount = new Map()
  const termFreqMap = new Map()

  /**
   * @param {Map<string, number>} counts
   * @param {string} term
   */
  const increment = (counts, term) => {
    counts.set(term, (counts.get(term) || 0) + 1)
  }

  for (const row of rows) {
    if (!row) continue

    // Terms that occur anywhere in this row, each counted once per row
    const rowTerms = new Set()
    for (const columnName of textColumns) {
      const text = row[columnName]
      const indexable = typeof text === 'string' && text.length > 0
      if (!indexable) continue

      for (const term of tokenize(text)) {
        rowTerms.add(term)
        increment(termFreqMap, term)
      }
    }

    rowTerms.forEach(term => increment(termDocCount, term))
  }

  return { termDocCount, termFreqMap }
}
132
+
133
/**
 * Pivot row-oriented index entries into the column-oriented layout expected by
 * hyparquet-writer.
 *
 * @param {IndexRow[]} indexRows
 * @returns {ColumnSource[]}
 */
function buildColumnData(indexRows) {
  /**
   * Extract one field of every index row, in row order.
   *
   * @param {keyof IndexRow} field
   * @returns {any[]}
   */
  const pluck = field => indexRows.map(row => row[field])

  return [
    // Delta byte array encoding works well for sorted string columns
    { name: 'term', data: pluck('term'), type: 'STRING', encoding: 'DELTA_BYTE_ARRAY' },
    // Delta binary packed works well for incrementing integers
    { name: 'blockId', data: pluck('blockId'), type: 'INT32', encoding: 'DELTA_BINARY_PACKED' },
    { name: 'docCount', data: pluck('docCount'), type: 'INT32' },
    { name: 'termFreq', data: pluck('termFreq'), type: 'INT32' },
  ]
}
package/src/index.d.ts ADDED
@@ -0,0 +1,29 @@
1
+ import type { CreateIndexOptions, ParquetSearchOptions, QueryIndexOptions, QueryResult } from './types.js'
2
+
3
+ export const hypGrepVersion: number
4
+
5
+ /**
6
+ * Create a full-text search index parquet next to the given parquet file.
7
+ */
8
+ export function createIndex(options: CreateIndexOptions): Promise<void>
9
+
10
+ /**
11
+ * Uses hypgrep to find rows matching a query from a source parquet file.
12
+ */
13
+ export function parquetFind(options: ParquetSearchOptions): AsyncGenerator<Record<string, any>, void, unknown>
14
+
15
+ /**
16
+ * Uses hypgrep to query a source parquet file and return matching rows ranked by relevance.
17
+ */
18
+ export function parquetSearch(options: ParquetSearchOptions): AsyncGenerator<Record<string, any>, void, unknown>
19
+
20
/**
 * Query a search index to find matching blocks of rows in the source parquet.
 * Resolves to undefined when the query contains no searchable terms.
 */
export function queryIndex(options: QueryIndexOptions): Promise<QueryResult | undefined>
24
+
25
+ /**
26
+ * Tokenize text into normalized terms.
27
+ * Lowercases and splits on non-alphanumeric boundaries.
28
+ */
29
+ export function tokenize(text: string): string[]
package/src/index.js ADDED
@@ -0,0 +1,6 @@
1
+ export { hypGrepVersion } from './constants.js'
2
+ export { createIndex } from './createIndex.js'
3
+ export { parquetFind } from './parquetFind.js'
4
+ export { parquetSearch } from './parquetSearch.js'
5
+ export { queryIndex } from './queryIndex.js'
6
+ export { tokenize } from './tokenize.js'
@@ -0,0 +1,113 @@
1
+ import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
2
+ import { queryIndex } from './queryIndex.js'
3
+ import { tokenize } from './tokenize.js'
4
+
5
/**
 * Find rows matching a query and yield them in natural (source) row order.
 * Each yielded row carries its source row number in `__index__`.
 *
 * @import {ParquetSearchOptions, TermResults} from '../src/types.js'
 * @param {ParquetSearchOptions} options
 * @returns {AsyncGenerator<Record<string, any>, void, unknown>}
 */
export async function* parquetFind({
  query,
  url,
  limit = Infinity,
  prefix = true,
  signal,
  asyncBufferFactory = asyncBufferFromUrl,
  sourceFile,
  sourceMetadata,
  indexFile,
  indexMetadata,
  ...hyparquetOptions
}) {
  if (!query || limit <= 0) return
  signal?.throwIfAborted()

  // Ask the index which blocks can contain matches; unless an index file is
  // supplied, it is opened from the sibling `<name>.index.parquet` url
  const indexBuffer = indexFile
    ?? await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` })
  const result = await queryIndex({ query, indexFile: indexBuffer, indexMetadata, prefix })
  if (!result) return
  const { blocks, textColumns, sourceByteLength } = result

  // Nothing matched
  if (blocks.length === 0) return

  // Ascending blockId gives natural row order
  blocks.sort((left, right) => left.blockId - right.blockId)
  signal?.throwIfAborted()

  // Open the source lazily, reusing the byte length recorded in the index when available
  const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength })
  // Source metadata is fetched once, and only when the caller did not supply it
  const metadata = sourceMetadata ?? await parquetMetadataAsync(file)

  // Query terms used to confirm matches row by row
  const queryTerms = new Set(tokenize(query))

  let yielded = 0
  for (const { rowStart, rowEnd, terms } of blocks) {
    signal?.throwIfAborted()
    const blockRows = await parquetReadObjects({
      ...hyparquetOptions,
      file,
      metadata,
      rowStart,
      rowEnd,
      useOffsetIndex: true,
    })

    // Emit matches in the order they appear in the block
    for (const [offset, row] of blockRows.entries()) {
      if (!matchesRow(row, textColumns, queryTerms, terms, prefix)) continue
      yield { __index__: rowStart + offset, ...row }
      yielded += 1
      if (yielded >= limit) return
    }
  }
}
71
+
72
/**
 * Decide whether a row contains at least one of the query terms.
 *
 * A match additionally requires the matched term to be present in the block's
 * term statistics from the index.
 *
 * @param {Record<string, any>} row
 * @param {string[]} textColumns
 * @param {Set<string>} queryTerms
 * @param {TermResults} termStats
 * @param {boolean} prefix - match row tokens that start with a query term instead of exact equality
 * @returns {boolean}
 */
function matchesRow(row, textColumns, queryTerms, termStats, prefix) {
  // Distinct tokens found in the row's string-valued text columns
  const rowTokens = new Set(
    textColumns
      .map(col => row[col])
      .filter(value => typeof value === 'string')
      .flatMap(value => tokenize(value))
  )
  const wanted = [...queryTerms]

  if (!prefix) {
    // Exact matching
    return wanted.some(queryTerm => rowTokens.has(queryTerm) && Boolean(termStats[queryTerm]))
  }

  // Prefix matching: any row token that starts with a query term
  const tokens = [...rowTokens]
  return wanted.some(queryTerm =>
    tokens.some(token => token.startsWith(queryTerm) && Boolean(termStats[token]))
  )
}
@@ -0,0 +1,128 @@
1
+ import { asyncBufferFromUrl, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
2
+ import { queryIndex } from './queryIndex.js'
3
+ import { tokenize } from './tokenize.js'
4
+
5
/**
 * Search a source parquet file through its hypgrep index and yield matching rows,
 * most relevant block first and, within a block, most relevant row first.
 * Each yielded row carries its source row number in `__index__`.
 *
 * @import {ParquetSearchOptions, TermResults} from '../src/types.js'
 * @param {ParquetSearchOptions} options
 * @returns {AsyncGenerator<Record<string, any>, void, unknown>}
 */
export async function* parquetSearch({
  query,
  url,
  limit = Infinity,
  prefix = true,
  signal,
  asyncBufferFactory = asyncBufferFromUrl,
  sourceFile,
  sourceMetadata,
  indexFile,
  indexMetadata,
  ...hyparquetOptions
}) {
  if (!query || limit <= 0) return
  signal?.throwIfAborted()

  // Ask the index which blocks can contain matches; unless an index file is
  // supplied, it is opened from the sibling `<name>.index.parquet` url
  const indexBuffer = indexFile
    ?? await asyncBufferFactory({ url: `${url.replace(/\.parquet$/i, '')}.index.parquet` })
  const result = await queryIndex({ query, indexFile: indexBuffer, indexMetadata, prefix })
  if (!result) return
  const { blocks, textColumns, sourceByteLength } = result

  // Highest scoring blocks first
  blocks.sort((left, right) => right.score - left.score)

  // Nothing matched
  if (blocks.length === 0) return
  signal?.throwIfAborted()

  // Open the source lazily, reusing the byte length recorded in the index when available
  const file = sourceFile ?? await asyncBufferFactory({ url, byteLength: sourceByteLength })
  // Source metadata is fetched once, and only when the caller did not supply it
  const metadata = sourceMetadata ?? await parquetMetadataAsync(file)

  // Query terms used to score rows
  const queryTerms = new Set(tokenize(query))

  let yielded = 0
  for (const { rowStart, rowEnd, terms } of blocks) {
    signal?.throwIfAborted()
    const blockRows = await parquetReadObjects({
      ...hyparquetOptions,
      file,
      metadata,
      rowStart,
      rowEnd,
      useOffsetIndex: true,
    })

    // Keep rows with a positive score, best first within the block
    /** @type {{index: number, row: Record<string, any>, score: number}[]} */
    const ranked = blockRows
      .map((row, offset) => ({
        index: rowStart + offset,
        row,
        score: scoreRow(row, textColumns, queryTerms, terms, prefix),
      }))
      .filter(candidate => candidate.score > 0)
      .sort((left, right) => right.score - left.score)

    for (const { index, row } of ranked) {
      yield { __index__: index, ...row }
      yielded += 1
      if (yielded >= limit) return
    }
  }
}
82
+
83
/**
 * Score a row by the query terms it contains, each match weighted by the IDF
 * recorded for the matched term (1 when the term has no recorded statistics).
 *
 * @param {Record<string, any>} row
 * @param {string[]} textColumns
 * @param {Set<string>} queryTerms
 * @param {TermResults} termStats
 * @param {boolean} prefix - match row tokens that start with a query term instead of exact equality
 * @returns {number} score (0 if no match)
 */
function scoreRow(row, textColumns, queryTerms, termStats, prefix) {
  // Distinct tokens found in the row's string-valued text columns
  const rowTokens = new Set(
    textColumns
      .map(col => row[col])
      .filter(value => typeof value === 'string')
      .flatMap(value => tokenize(value))
  )

  let total = 0
  for (const queryTerm of queryTerms) {
    // Terms of the row matched by this query term
    let matchedTerms
    if (prefix) {
      matchedTerms = [...rowTokens].filter(token => token.startsWith(queryTerm))
    } else {
      matchedTerms = rowTokens.has(queryTerm) ? [queryTerm] : []
    }

    // Weight each match by the matched term's stats from the index
    for (const term of matchedTerms) {
      total += termStats[term]?.idf ?? 1
    }
  }

  return total
}
@@ -0,0 +1,171 @@
1
+ import { parquetMetadataAsync, parquetQuery } from 'hyparquet'
2
+ import { defaultBlockSize, hypGrepVersion } from './constants.js'
3
+ import { tokenize } from './tokenize.js'
4
+
5
+ /**
6
+ * @import { FileMetaData, KeyValue, ParquetQueryFilter } from 'hyparquet'
7
+ * @import { BlockResult, HypGrepMetadata, QueryIndexOptions, QueryResult, TermResults } from './types.js'
8
+ */
9
+
10
/**
 * Build the pushdown filter used to select index rows for a set of terms.
 *
 * With prefix matching, each term becomes the half-open range
 * [term, term with its last char code incremented by one), which covers every
 * string starting with the term. Otherwise the filter is an exact `$in` match.
 *
 * @param {string[]} terms
 * @param {boolean} prefix - whether to use prefix matching
 * @returns {ParquetQueryFilter}
 */
function termsFilter(terms, prefix) {
  if (!prefix) {
    return { term: { $in: terms } }
  }

  const ranges = []
  for (const term of terms) {
    const head = term.slice(0, -1)
    const nextLastChar = String.fromCharCode(term.charCodeAt(term.length - 1) + 1)
    ranges.push({ term: { $gte: term, $lt: head + nextLastChar } })
  }
  return { $or: ranges }
}
30
+
31
/**
 * Query a search index to find the blocks of source rows that match a query.
 * Returns undefined if the query has no terms, so the search index is not used.
 *
 * Each returned block carries its row range in the source file, a BM25-style
 * score accumulated over all matching index rows, and the statistics of the
 * index terms that matched within the block.
 *
 * @param {QueryIndexOptions} options
 * @returns {Promise<QueryResult | undefined>}
 */
export async function queryIndex({ query, indexFile, indexMetadata, prefix = true }) {
  // Tokenize the query using the same logic as indexing
  const queryTerms = tokenize(query)
  if (queryTerms.length === 0) return undefined

  // Read index kv metadata
  indexMetadata ??= await parquetMetadataAsync(indexFile)
  const kvMetadata = indexMetadata.key_value_metadata || []
  const { blockSize, textColumns, sourceByteLength, sourceRows } = parseKvMetadata(kvMetadata)

  // Read index rows matching any of the query terms
  const indexRows = await parquetQuery({
    file: indexFile,
    metadata: indexMetadata,
    // use hyparquet pushdown filtering
    filter: termsFilter(queryTerms, prefix),
  })

  // Pre-compute corpusDocFreq by summing docCount per term
  /** @type {Map<string, number>} */
  const corpusDocFreq = new Map()
  for (const row of indexRows) {
    const prev = corpusDocFreq.get(row.term) || 0
    corpusDocFreq.set(row.term, prev + row.docCount)
  }

  // BM25 parameters
  const k1 = 1.2 // controls term frequency saturation
  const b = 0.75 // controls length normalization

  // Map to accumulate scores per blockId
  /** @type {Map<number, number>} */
  const blockScores = new Map()
  // Map to accumulate term statistics per blockId
  /** @type {Map<number, TermResults>} */
  const blockTerms = new Map()

  // For each query term, find matching blocks and accumulate scores
  for (const queryTerm of queryTerms) {
    for (const indexRow of indexRows) {
      // Check if this index term matches (exact or prefix)
      const matches = prefix
        ? indexRow.term.startsWith(queryTerm)
        : indexRow.term === queryTerm
      if (!matches) continue

      // Use actual index term's corpus doc freq for scoring
      const termCorpusDocFreq = corpusDocFreq.get(indexRow.term) || 0

      // IDF component: log((N - df + 0.5) / (df + 0.5) + 1)
      const idf = Math.log((sourceRows - termCorpusDocFreq + 0.5) / (termCorpusDocFreq + 0.5) + 1)

      // TF component with saturation and length normalization
      const tf = indexRow.termFreq
      const tfComponent = tf * (k1 + 1) / (tf + k1 * (1 - b + b * indexRow.docCount / blockSize))

      const currentScore = blockScores.get(indexRow.blockId) || 0
      blockScores.set(indexRow.blockId, currentScore + idf * tfComponent)

      // Collect term statistics
      let terms = blockTerms.get(indexRow.blockId)
      if (!terms) {
        terms = {}
        blockTerms.set(indexRow.blockId, terms)
      }
      terms[indexRow.term] = {
        docs: indexRow.docCount,
        frequency: indexRow.termFreq,
        idf,
      }
    }
  }

  // Convert block scores to BlockResults
  /** @type {BlockResult[]} */
  const blocks = []
  for (const [blockId, score] of blockScores.entries()) {
    const rowStart = blockId * blockSize
    // Clamp the last (partial) block to the number of rows in the SOURCE file.
    // The index file's own row count counts (term, block) pairs and says
    // nothing about where the source ends.
    const rowEnd = Math.min((blockId + 1) * blockSize, sourceRows)
    const terms = blockTerms.get(blockId) || {}
    blocks.push({ blockId, rowStart, rowEnd, score, terms })
  }

  return { blocks, textColumns, sourceByteLength }
}
128
+
129
/**
 * Extract the hypgrep settings stored in an index file's key-value metadata.
 *
 * `blockSize` falls back to the default block size and `textColumns` to an
 * empty list when their keys are absent; the source row count and byte length
 * are mandatory.
 *
 * @param {KeyValue[]} kvMetadata
 * @returns {HypGrepMetadata}
 */
export function parseKvMetadata(kvMetadata) {
  let blockSize = defaultBlockSize
  /** @type {string[]} */
  let textColumns = []
  /** @type {number | undefined} */
  let sourceByteLength
  /** @type {number | undefined} */
  let sourceRows

  for (const { key, value } of kvMetadata) {
    switch (key) {
      case 'hypgrep.block_size':
        blockSize = Number(value)
        break
      case 'hypgrep.version':
        // Only the index format version this library writes is readable
        if (Number(value) !== hypGrepVersion) {
          throw new Error(`Unsupported hypgrep version ${value}`)
        }
        break
      case 'hypgrep.text_columns':
        // An empty value keeps the empty default
        if (value) textColumns = value.split(',')
        break
      case 'hypgrep.source_rows':
        sourceRows = Number(value)
        break
      case 'hypgrep.source_bytelength':
        sourceByteLength = Number(value)
        break
      default:
        break
    }
  }

  if (sourceRows === undefined) {
    throw new Error('Missing hypgrep.source_rows in index metadata')
  }
  if (sourceByteLength === undefined) {
    throw new Error('Missing hypgrep.source_bytelength in index metadata')
  }

  return { blockSize, textColumns, sourceByteLength, sourceRows }
}
package/src/stemmer.js ADDED
@@ -0,0 +1,78 @@
1
+
2
// Letters counted as vowels when checking a candidate stem ('y' included)
const vowels = 'aeiouy'

// Suffix rules, tried in array order (longest suffixes first).
//   suffix     - ending to strip
//   minStem    - minimum length of what remains after stripping
//   needsVowel - when true, the remaining stem must contain a vowel
//   plural     - when true, plural look-alike endings are left alone
const suffixRules = [
  {
    suffix: 'ing',
    minStem: 4,
    needsVowel: true,
  },
  {
    suffix: 'ed',
    minStem: 3,
    needsVowel: true,
  },
  {
    suffix: 'ly',
    minStem: 3,
  },
  // plural-ish endings need extra care
  {
    suffix: 'es',
    minStem: 3,
    needsVowel: true,
    plural: true,
  },
  {
    suffix: 's',
    minStem: 3,
    needsVowel: true,
    plural: true,
  },
  // deliberately omitted, bad with i/y: er, ers, est, ies
  // other candidates: ment, ingly, edly, ness
]
15
+
16
/**
 * Lightweight suffix-stripping stemmer for English words.
 *
 * The result is always either the input itself or a prefix of it: the first
 * applicable rule in `suffixRules` has its suffix removed and nothing is ever
 * appended or rewritten. Words shorter than four characters, and words
 * containing anything other than the letters a-z, are returned unchanged.
 *
 * @param {string} term - lowercase word to stem
 * @returns {string} stemmed word
 */
export function stemmer(term) {
  // Short words and anything that isn't plain a-z are left alone
  if (term.length < 4 || !isLowerAlpha(term)) return term

  for (const { suffix, minStem, needsVowel, plural } of suffixRules) {
    if (!term.endsWith(suffix)) continue

    const stem = term.slice(0, term.length - suffix.length)
    const stemIsUsable = stem.length >= minStem && (!needsVowel || hasVowel(stem))
    if (!stemIsUsable) continue

    // Endings that only look like plurals: class/boss, virus/status, this/analysis
    const looksPluralButIsNot =
      term.endsWith('ss') || term.endsWith('us') || term.endsWith('is')
    if (plural && looksPluralButIsNot) continue

    return stem
  }

  // No rule applied
  return term
}
56
+
57
/**
 * Report whether a string contains at least one of the letters in `vowels`.
 *
 * @param {string} s
 * @returns {boolean} true if any character of `s` is a vowel
 */
function hasVowel(s) {
  for (const ch of s) {
    if (vowels.includes(ch)) return true
  }
  return false
}
67
+
68
/**
 * Report whether a string consists solely of the ASCII letters a-z.
 * The empty string counts as valid.
 *
 * @param {string} s
 * @returns {boolean} false as soon as any other character is present
 */
function isLowerAlpha(s) {
  return /^[a-z]*$/.test(s)
}
@@ -0,0 +1,72 @@
1
+ import { stemmer } from './stemmer.js'
2
+
3
/**
 * Very common English words that carry little search value.
 * `tokenize` drops any token found in this set, which keeps the index smaller.
 */
const STOP_WORDS = new Set([
  'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that',
  'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as',
  'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from',
  'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will',
  'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so',
  'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go',
  'me', 'when', 'make', 'can', 'like', 'no', 'just', 'him',
  'know', 'take', 'into', 'your', 'some', 'could', 'them', 'than',
  'then', 'now', 'only', 'its', 'also', 'other', 'how', 'our',
  'may', 'these', 'was', 'been', 'has', 'had', 'are', 'is',
  'am', 'were', 'does', 'did', 'being',
])
18
+
19
/**
 * Break camelCase / PascalCase identifiers apart by putting a space at every
 * boundary where an ASCII lowercase letter or digit is followed by an ASCII
 * uppercase letter.
 *
 * "parseUserInput" becomes "parse User Input"; runs of capitals such as
 * "XMLParser" are left untouched because no lowercase letter precedes them.
 *
 * @param {string} text
 * @returns {string}
 */
function splitCamelCase(text) {
  const lowerThenUpper = /([a-z0-9])([A-Z])/g
  return text.replace(lowerThenUpper, (_match, before, after) => `${before} ${after}`)
}
30
+
31
/**
 * Strip diacritics from text, e.g. "café" becomes "cafe" and "résumé"
 * becomes "resume".
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeUnicode(text) {
  // NFD splits precomposed characters into a base letter plus combining marks
  const decomposed = text.normalize('NFD')
  // Drop the combining diacritical marks (code points U+0300 through U+036F)
  const combiningMarks = /[\u0300-\u036f]/g
  return decomposed.replace(combiningMarks, '')
}
43
+
44
/**
 * Turn free text into a list of normalized search terms.
 *
 * Steps, in order: split camelCase boundaries, strip diacritics, lowercase,
 * split on runs of characters other than a-z and 0-9, discard tokens shorter
 * than two characters and stop words, then stem what remains.
 *
 * @param {string} text
 * @returns {string[]} stemmed terms in the order they appear in `text`
 */
export function tokenize(text) {
  // camelCase is split first, while the original casing is still available
  const prepared = normalizeUnicode(splitCamelCase(text)).toLowerCase()

  return prepared
    .split(/[^a-z0-9]+/g)
    .filter((token) => token.length >= 2 && !STOP_WORDS.has(token))
    .map((token) => stemmer(token))
}
package/src/types.d.ts ADDED
@@ -0,0 +1,92 @@
1
+ import type { AsyncBuffer, asyncBufferFromUrl, Compressors, FileMetaData, ParquetParsers, ParquetQueryFilter } from 'hyparquet'
2
+ import type { Writer } from 'hyparquet-writer'
3
+
4
/**
 * Options for building an index from a source parquet file.
 */
export interface CreateIndexOptions {
  /** File reader for the source parquet file. */
  sourceFile: AsyncBuffer
  /** Optional source parquet metadata. */
  sourceMetadata?: FileMetaData
  /** File writer for the output index parquet file. */
  indexFile: Writer
  /** Number of rows per logical block. */
  blockSize?: number
  /** Row group size in the index file. */
  indexRowGroupSize?: number
}
11
+
12
/**
 * Options for running a search query against an index file.
 */
export interface QueryIndexOptions {
  /** The search query string. */
  query: string
  /** File reader for the index parquet file. */
  indexFile: AsyncBuffer
  /** Optional index parquet metadata. */
  indexMetadata?: FileMetaData
  /** Enable prefix matching (default: true). */
  prefix?: boolean
}
18
+
19
/**
 * Options for searching a source parquet file.
 */
export interface ParquetSearchOptions {
  /** The search query string. */
  query: string
  /** URL or file path to the source parquet file. */
  url: string
  /** Maximum number of matching rows to return. */
  limit?: number
  /** Enable prefix matching (default: true). */
  prefix?: boolean

  // ---- fetch options ----

  /** Optional AbortSignal to cancel the search operation. */
  signal?: AbortSignal
  /** Optional factory to create AsyncBuffers for source and index files. */
  asyncBufferFactory?: typeof asyncBufferFromUrl
  /** File reader for the source parquet file. */
  sourceFile?: AsyncBuffer
  /** Optional source parquet metadata. */
  sourceMetadata?: FileMetaData
  /** File reader for the index parquet file. */
  indexFile?: AsyncBuffer
  /** Optional index parquet metadata. */
  indexMetadata?: FileMetaData

  // ---- misc options passed through to hyparquet ----

  columns?: string[]
  filter?: ParquetQueryFilter
  compressors?: Compressors
  utf8?: boolean
  parsers?: ParquetParsers
}
40
+
41
/**
 * One entry of the search index: statistics for a term within a logical block.
 */
export interface IndexRow {
  /** Normalized search term. */
  term: string
  /** Logical block ID this term appears in. */
  blockId: number
  /** Number of documents in the block containing this term. */
  docCount: number
  /** Total frequency of the term in the block. */
  termFreq: number
}
50
+
51
/**
 * Result of querying the index.
 */
export interface QueryResult {
  /** List of matching blocks. */
  blocks: BlockResult[]
  /** List of indexed text columns in the source parquet. */
  textColumns: string[]
  /** Byte length of the source parquet file. */
  sourceByteLength: number
}
56
+
57
/**
 * A block of rows from the source parquet that matched the query.
 */
export interface BlockResult {
  blockId: number
  /** Starting row index (inclusive) in the source parquet. */
  rowStart: number
  /** Ending row index (exclusive) in the source parquet. */
  rowEnd: number
  /** Relevance score based on term frequency. */
  score: number
  /** Per-term statistics. */
  terms: TermResults
}
67
+
68
/** Per-term statistics for a block, keyed by term. */
export type TermResults = Record<string, TermResult>

/** Statistics for a single term within a block. */
interface TermResult {
  /** Number of documents in the block containing this term. */
  docs: number
  /** Total occurrences of the term in the block. */
  frequency: number
  /** Inverse document frequency for this term. */
  idf: number
}
74
+
75
/**
 * Index settings and source-file facts, as parsed from the index file's
 * key-value metadata.
 */
export interface HypGrepMetadata {
  /** Number of rows per logical block. */
  blockSize: number
  /** List of indexed text columns. */
  textColumns: string[]
  /** Number of rows in the source parquet file. */
  sourceRows: number
  /** Byte length of the source parquet file. */
  sourceByteLength: number
}
85
+
86
/**
 * Per-block term statistics gathered while the index is being created.
 */
export interface BlockStats {
  /** Number of documents containing each term. */
  termDocCount: Map<string, number>
  /** Total frequency of each term in the block. */
  termFreqMap: Map<string, number>
}
package/src/utils.js ADDED
@@ -0,0 +1,44 @@
1
+ import { parquetSchema } from 'hyparquet'
2
+
3
+ /**
4
+ * @import {FileMetaData, SchemaTree} from 'hyparquet'
5
+ */
6
+
7
+ /**
8
+ * Get string column names from the parquet schema.
9
+ *
10
+ * @param {FileMetaData} metadata
11
+ * @returns {string[]}
12
+ */
13
+ export function getTextColumnsFromSchema(metadata) {
14
+ const schemaTree = parquetSchema(metadata)
15
+ /** @type {string[]} */
16
+ const textColumns = []
17
+
18
+ /**
19
+ * @param {SchemaTree} node
20
+ */
21
+ function traverse(node) {
22
+ const { element, children } = node
23
+
24
+ // Check if this is a string column
25
+ const isString =
26
+ element.converted_type === 'UTF8' ||
27
+ element.logical_type?.type === 'STRING'
28
+
29
+ // If it's a leaf node (no children) and it's a string, add it
30
+ if (isString && (!children || children.length === 0)) {
31
+ textColumns.push(element.name)
32
+ }
33
+
34
+ // Traverse children
35
+ if (children) {
36
+ for (const child of children) {
37
+ traverse(child)
38
+ }
39
+ }
40
+ }
41
+
42
+ traverse(schemaTree)
43
+ return textColumns
44
+ }