tablassist 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: tablassist
3
+ Version: 0.1.0
4
+ Summary: AI-assisted table configuration generation for Tablassert — entity resolution, YAML validation, and Biolink documentation lookup.
5
+ Project-URL: Homepage, https://github.com/SkyeAv/Tablassist
6
+ Project-URL: Source, https://github.com/SkyeAv/Tablassist
7
+ Author-email: Skye Lane Goetz <sgoetz@isbscience.org>
8
+ License-Expression: Apache-2.0
9
+ Keywords: bioinformatics,biolink,data quality control,entity resolution,knowledge graph,ncats translator,tablassert,tablassist,table mining,yaml configuration
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Framework :: Pydantic
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Healthcare Industry
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: Apache Software License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
21
+ Requires-Python: >=3.13
22
+ Requires-Dist: cyclopts>=4.10.1
23
+ Requires-Dist: fastexcel>=0.19.0
24
+ Requires-Dist: httpx>=0.28.1
25
+ Requires-Dist: polars>=1.39.3
26
+ Requires-Dist: pydantic>=2.12.5
27
+ Requires-Dist: pyyaml>=6.0.3
28
+ Requires-Dist: tablassert>=7.2.1
29
+ Requires-Dist: textract>=1.6.5
30
+ Requires-Dist: trafilatura>=2.0.0
31
+ Provides-Extra: rt
32
+ Requires-Dist: polars[rtcompat]>=1.39.0; extra == 'rt'
33
+ Requires-Dist: tablassert[rtcompat]>=7.2.1; extra == 'rt'
34
+ Provides-Extra: rtcompat
35
+ Requires-Dist: polars[rtcompat]>=1.39.0; extra == 'rtcompat'
36
+ Requires-Dist: tablassert[rtcompat]>=7.2.1; extra == 'rtcompat'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Tablassist CLI
40
+
41
+ [![PyPI](https://img.shields.io/pypi/v/tablassist.svg)](https://pypi.org/project/tablassist/)
42
+ [![Python](https://img.shields.io/pypi/pyversions/tablassist.svg)](https://pypi.org/project/tablassist/)
43
+ [![License](https://img.shields.io/pypi/l/tablassist.svg)](https://github.com/SkyeAv/Tablassist/blob/master/LICENSE)
44
+
45
+ Python CLI tool for AI-assisted [Tablassert](https://github.com/SkyeAv/Tablassert) table configuration generation — entity resolution, YAML validation, and Biolink documentation lookup.
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install tablassist
51
+ ```
52
+
53
+ An optional extra is available for CPU compatibility:
54
+
55
+ ```bash
56
+ pip install "tablassist[rtcompat]"  # Polars build for older CPUs that lack the instruction sets the default build requires
57
+ ```
58
+
59
+ ### Requirements
60
+
61
+ - Python >= 3.13
62
+ - Environment variables `TABLASSIST_USERNAME` and `TABLASSIST_API_KEY` for API-accessing commands
63
+
64
+ ## Usage
65
+
66
+ ```bash
67
+ # Fetch table configuration documentation
68
+ tablassist docs-table-config
69
+
70
+ # Fetch advanced configuration examples
71
+ tablassist docs-advanced-examples
72
+
73
+ # Fetch the CLI tutorial
74
+ tablassist docs-tutorial
75
+ ```
76
+
77
+ ### Entity Resolution
78
+
79
+ ```bash
80
+ # Search for entity CURIEs by term
81
+ tablassist search-curies "breast cancer"
82
+
83
+ # Get canonical info for a specific CURIE
84
+ tablassist get-curie-info "MONDO:0007254"
85
+
86
+ # Search gene CURIEs within an NCBI taxon
87
+ tablassist search-gene-curies "BRCA1" --ncbi-taxon 9606
88
+
89
+ # Resolve an NCBI Taxon ID from an organism name
90
+ tablassist resolve-taxon-id "Homo sapiens"
91
+ ```
92
+
93
+ ### Biolink Reference
94
+
95
+ ```bash
96
+ # List all supported categories, predicates, or qualifiers
97
+ tablassist list-categories
98
+ tablassist list-predicates
99
+ tablassist list-qualifiers
100
+
101
+ # Fetch documentation for a specific Biolink element
102
+ tablassist docs-category "Gene"
103
+ tablassist docs-predicate "interacts_with"
104
+ tablassist docs-qualifier "qualified_predicate"
105
+ ```
106
+
107
+ ### YAML Validation
108
+
109
+ ```bash
110
+ # Validate a full config file
111
+ tablassist validate-config-file config.yaml
112
+
113
+ # Validate a single section from a YAML string
114
+ tablassist validate-section-str '<yaml>'
115
+
116
+ # Validate a full config from a YAML string
117
+ tablassist validate-config-str '<yaml>'
118
+
119
+ # Get the Section JSON schema
120
+ tablassist section-schema
121
+ ```
122
+
123
+ ### Data Preview
124
+
125
+ ```bash
126
+ # List sheets in an Excel file
127
+ tablassist excel-sheets data.xlsx
128
+
129
+ # Preview rows from an Excel sheet
130
+ tablassist preview-excel data.xlsx "Sheet1" 10
131
+
132
+ # Preview rows from a CSV file
133
+ tablassist preview-csv data.csv 10
134
+
135
+ # Extract text from a document (PDF, DOCX, etc.)
136
+ tablassist extract-text document.pdf
137
+ ```
138
+
139
+ ### PMC Archive Download
140
+
141
+ ```bash
142
+ # Download and extract a PMC tar archive
143
+ tablassist download-pmc-tar 12345 --dest-dir ./output
144
+ ```
145
+
146
+ ## Development
147
+
148
+ ```bash
149
+ uv sync # install dependencies
150
+ uv run ruff check . # lint
151
+ uv run ruff check --fix . # lint with auto-fix
152
+ uv run ruff format . # format
153
+ uv run pyright # type check
154
+ uv run pytest # run all tests
155
+ ```
156
+
157
+ ## License
158
+
159
+ [Apache License 2.0](https://github.com/SkyeAv/Tablassist/blob/master/LICENSE)
@@ -0,0 +1,121 @@
1
+ # Tablassist CLI
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/tablassist.svg)](https://pypi.org/project/tablassist/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/tablassist.svg)](https://pypi.org/project/tablassist/)
5
+ [![License](https://img.shields.io/pypi/l/tablassist.svg)](https://github.com/SkyeAv/Tablassist/blob/master/LICENSE)
6
+
7
+ Python CLI tool for AI-assisted [Tablassert](https://github.com/SkyeAv/Tablassert) table configuration generation — entity resolution, YAML validation, and Biolink documentation lookup.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install tablassist
13
+ ```
14
+
15
+ An optional extra is available for CPU compatibility:
16
+
17
+ ```bash
18
+ pip install "tablassist[rtcompat]"  # Polars build for older CPUs that lack the instruction sets the default build requires
19
+ ```
20
+
21
+ ### Requirements
22
+
23
+ - Python >= 3.13
24
+ - Environment variables `TABLASSIST_USERNAME` and `TABLASSIST_API_KEY` for API-accessing commands
25
+
26
+ ## Usage
27
+
28
+ ```bash
29
+ # Fetch table configuration documentation
30
+ tablassist docs-table-config
31
+
32
+ # Fetch advanced configuration examples
33
+ tablassist docs-advanced-examples
34
+
35
+ # Fetch the CLI tutorial
36
+ tablassist docs-tutorial
37
+ ```
38
+
39
+ ### Entity Resolution
40
+
41
+ ```bash
42
+ # Search for entity CURIEs by term
43
+ tablassist search-curies "breast cancer"
44
+
45
+ # Get canonical info for a specific CURIE
46
+ tablassist get-curie-info "MONDO:0007254"
47
+
48
+ # Search gene CURIEs within an NCBI taxon
49
+ tablassist search-gene-curies "BRCA1" --ncbi-taxon 9606
50
+
51
+ # Resolve an NCBI Taxon ID from an organism name
52
+ tablassist resolve-taxon-id "Homo sapiens"
53
+ ```
54
+
55
+ ### Biolink Reference
56
+
57
+ ```bash
58
+ # List all supported categories, predicates, or qualifiers
59
+ tablassist list-categories
60
+ tablassist list-predicates
61
+ tablassist list-qualifiers
62
+
63
+ # Fetch documentation for a specific Biolink element
64
+ tablassist docs-category "Gene"
65
+ tablassist docs-predicate "interacts_with"
66
+ tablassist docs-qualifier "qualified_predicate"
67
+ ```
68
+
69
+ ### YAML Validation
70
+
71
+ ```bash
72
+ # Validate a full config file
73
+ tablassist validate-config-file config.yaml
74
+
75
+ # Validate a single section from a YAML string
76
+ tablassist validate-section-str '<yaml>'
77
+
78
+ # Validate a full config from a YAML string
79
+ tablassist validate-config-str '<yaml>'
80
+
81
+ # Get the Section JSON schema
82
+ tablassist section-schema
83
+ ```
84
+
85
+ ### Data Preview
86
+
87
+ ```bash
88
+ # List sheets in an Excel file
89
+ tablassist excel-sheets data.xlsx
90
+
91
+ # Preview rows from an Excel sheet
92
+ tablassist preview-excel data.xlsx "Sheet1" 10
93
+
94
+ # Preview rows from a CSV file
95
+ tablassist preview-csv data.csv 10
96
+
97
+ # Extract text from a document (PDF, DOCX, etc.)
98
+ tablassist extract-text document.pdf
99
+ ```
100
+
101
+ ### PMC Archive Download
102
+
103
+ ```bash
104
+ # Download and extract a PMC tar archive
105
+ tablassist download-pmc-tar 12345 --dest-dir ./output
106
+ ```
107
+
108
+ ## Development
109
+
110
+ ```bash
111
+ uv sync # install dependencies
112
+ uv run ruff check . # lint
113
+ uv run ruff check --fix . # lint with auto-fix
114
+ uv run ruff format . # format
115
+ uv run pyright # type check
116
+ uv run pytest # run all tests
117
+ ```
118
+
119
+ ## License
120
+
121
+ [Apache License 2.0](https://github.com/SkyeAv/Tablassist/blob/master/LICENSE)
@@ -0,0 +1,99 @@
1
+ [project]
2
+ name = "tablassist"
3
+ version = "0.1.0"
4
+ description = "AI-assisted table configuration generation for Tablassert — entity resolution, YAML validation, and Biolink documentation lookup."
5
+ authors = [
6
+ { name = "Skye Lane Goetz", email = "sgoetz@isbscience.org" },
7
+ ]
8
+ keywords = [
9
+ "knowledge graph",
10
+ "bioinformatics",
11
+ "entity resolution",
12
+ "ncats translator",
13
+ "yaml configuration",
14
+ "table mining",
15
+ "tablassert",
16
+ "tablassist",
17
+ "biolink",
18
+ "data quality control",
19
+ ]
20
+ readme = "README.md"
21
+ license = "Apache-2.0"
22
+ classifiers = [
23
+ "License :: OSI Approved :: Apache Software License",
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Science/Research",
26
+ "Intended Audience :: Healthcare Industry",
27
+ "Intended Audience :: Developers",
28
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
29
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
30
+ "Programming Language :: Python :: 3.13",
31
+ "Framework :: Pydantic",
32
+ "Operating System :: OS Independent",
33
+ "Environment :: Console",
34
+ ]
35
+ requires-python = ">=3.13"
36
+ dependencies = [
37
+ "cyclopts>=4.10.1",
38
+ "fastexcel>=0.19.0",
39
+ "httpx>=0.28.1",
40
+ "polars>=1.39.3",
41
+ "pydantic>=2.12.5",
42
+ "pyyaml>=6.0.3",
43
+ "tablassert>=7.2.1",
44
+ "textract>=1.6.5",
45
+ "trafilatura>=2.0.0",
46
+ ]
47
+
48
+ [project.urls]
49
+ Homepage = "https://github.com/SkyeAv/Tablassist"
50
+ Source = "https://github.com/SkyeAv/Tablassist"
51
+
52
+ [build-system]
53
+ requires = [
54
+ "hatchling",
55
+ ]
56
+ build-backend = "hatchling.build"
57
+
58
+ [tool.hatch.build.targets.wheel]
59
+ packages = [
60
+ "./src/tablassist",
61
+ ]
62
+
63
+ [project.scripts]
64
+ tablassist = "tablassist.cli:serve"
65
+
66
+ [project.optional-dependencies]
67
+ rtcompat = [
68
+ "tablassert[rtcompat]>=7.2.1",
69
+ "polars[rtcompat]>=1.39.0",
70
+ ]
71
+ rt = [
72
+ "tablassist[rtcompat]",
73
+ ]
74
+
75
+ [tool.uv]
76
+ override-dependencies = [
77
+ "six>=1.16.0",
78
+ ]
79
+
80
+ [dependency-groups]
81
+ dev = [
82
+ "mkdocs>=1.6.1",
83
+ "pyright>=1.1.408",
84
+ "pytest>=9.0.2",
85
+ "ruff>=0.15.6",
86
+ ]
87
+
88
+ [tool.pytest.ini_options]
89
+ testpaths = ["test"]
90
+
91
+ [tool.ruff]
92
+ line-length = 120
93
+ indent-width = 4
94
+ target-version = "py313"
95
+
96
+ [tool.ruff.format]
97
+ quote-style = "double"
98
+ indent-style = "space"
99
+ skip-magic-trailing-comma = true
File without changes
@@ -0,0 +1,268 @@
1
+ import os
2
+ import re
3
+ import subprocess
4
+ from pathlib import Path
5
+ from typing import Any, Literal, Optional, Union
6
+
7
+ import fastexcel
8
+ import httpx
9
+ import polars as pl
10
+ import textract
11
+ import yaml
12
+ from cyclopts import App
13
+ from tablassert.enums import Categories, Predicates, Qualifiers
14
+ from tablassert.ingests import from_yaml
15
+ from tablassert.models import Section
16
+
17
+ from tablassist.utils import (
18
+ get_biolink_html_documentation,
19
+ get_json_response,
20
+ get_static_content,
21
+ parse_yaml_string,
22
+ validate_section,
23
+ )
24
+
25
+ CLI: App = App()
26
+
27
+
28
@CLI.command
def docs_table_config() -> str:
    """Fetch Tablassert table configuration spec documentation.

    Returns:
        Whatever ``get_static_content`` yields for the ``table.md`` page of
        the Tablassert repository on raw.githubusercontent.com.
    """
    return get_static_content(
        "https://raw.githubusercontent.com/SkyeAv/Tablassert/main/docs/configuration/table.md"
    )
33
+
34
+
35
@CLI.command
def docs_advanced_examples() -> str:
    """Fetch advanced table configuration examples documentation.

    Returns:
        Whatever ``get_static_content`` yields for the ``advanced-example.md``
        page of the Tablassert repository on raw.githubusercontent.com.
    """
    return get_static_content(
        "https://raw.githubusercontent.com/SkyeAv/Tablassert/main/docs/configuration/advanced-example.md"
    )
40
+
41
+
42
@CLI.command
def docs_tutorial() -> str:
    """Fetch Tablassert CLI tutorial documentation.

    Returns:
        Whatever ``get_static_content`` yields for the ``tutorial.md`` page of
        the Tablassert repository on raw.githubusercontent.com.
    """
    # raw.githubusercontent.com paths are <owner>/<repo>/<ref>/<path>; the
    # "/blob/" segment only exists in github.com web URLs and breaks the raw
    # download, so the path mirrors the sibling docs_* commands.
    url: str = "https://raw.githubusercontent.com/SkyeAv/Tablassert/main/docs/tutorial.md"
    return get_static_content(url)
47
+
48
+
49
@CLI.command
def example_no_sections() -> str:
    """Fetch a production YAML config example without sections.

    Returns:
        Whatever ``get_static_content`` yields for the ``ALAM1.yaml`` file of
        the MOKGConfiguration repository.
    """
    return get_static_content(
        "https://raw.githubusercontent.com/glusman-team/MOKGConfiguration/refs/heads/master/TABLE/MBKG/ALAM1.yaml"
    )
56
+
57
+
58
@CLI.command
def example_with_sections() -> str:
    """Fetch a production YAML config example with sections.

    Returns:
        Whatever ``get_static_content`` yields for the ``BLANTON1.yaml`` file
        of the MOKGConfiguration repository.
    """
    return get_static_content(
        "https://raw.githubusercontent.com/glusman-team/MOKGConfiguration/refs/heads/master/TABLE/MBKG/BLANTON1.yaml"
    )
65
+
66
+
67
# Credentials sent as the "username" / "api-key" query parameters of the
# Configurator API commands below. They are read from the environment once,
# at import time, and fall back to an empty string when the variable is unset.
TABLASSIST_USERNAME: str = os.getenv("TABLASSIST_USERNAME", "")
TABLASSIST_API_KEY: str = os.getenv("TABLASSIST_API_KEY", "")
69
+
70
+
71
@CLI.command
def search_curies(term: str) -> Union[list[Any], dict[str, Any]]:
    """Search CURIE candidates by term via Configurator API.

    Args:
        term: Search term sent as the ``term`` query parameter.

    Returns:
        The value returned by ``get_json_response`` for the request.
    """
    query: dict[str, Any] = {
        "username": TABLASSIST_USERNAME,
        "api-key": TABLASSIST_API_KEY,
        "term": term,
    }
    return get_json_response("https://hypatia.systemsbiology.net/configurator-api/search-for-curies", query)
78
+
79
+
80
@CLI.command
def get_curie_info(curie: str) -> Union[list[Any], dict[str, Any]]:
    """Resolve a single canonical CURIE record.

    Args:
        curie: CURIE sent as the ``curie`` query parameter.

    Returns:
        The value returned by ``get_json_response`` for the request.
    """
    query: dict[str, Any] = {
        "username": TABLASSIST_USERNAME,
        "api-key": TABLASSIST_API_KEY,
        "curie": curie,
    }
    return get_json_response("https://hypatia.systemsbiology.net/configurator-api/get-canonical-curie-info", query)
87
+
88
+
89
@CLI.command
def download_pmc_tar(pmc_id: int, dest_dir: Path = Path(".")) -> dict[str, Any]:
    """Download and extract a PMC tar archive by PMC ID.

    The archive is streamed from the Configurator API into ``dest_dir``,
    unpacked there with ``tar``, and the directory is then listed with
    ``ls -lh``.

    Args:
        pmc_id: PMC identifier sent as the ``pmc-id`` query parameter.
        dest_dir: Directory that receives the archive and its extracted
            contents; it is created when missing.

    Returns:
        The decoded JSON body when the API answers 404. Otherwise a dict with
        ``status`` (``"ok"``, or ``"error"`` when ``tar`` exits non-zero) and
        the captured ``stdout`` / ``stderr`` of the commands that ran.

    Raises:
        httpx.HTTPStatusError: For any other unsuccessful HTTP status.
    """
    url: str = "https://hypatia.systemsbiology.net/configurator-api/download-from-pmc-tars"
    params: dict[str, Any] = {"username": TABLASSIST_USERNAME, "api-key": TABLASSIST_API_KEY, "pmc-id": pmc_id}

    with httpx.stream("GET", url, params=params) as response:
        if response.status_code == 404:
            # A streamed body has to be read explicitly before it can be decoded.
            response.read()
            error: dict[str, Any] = response.json()
            return error
        response.raise_for_status()

        disposition: str = response.headers.get("content-disposition", "")
        match = re.search(r'filename="?([^";]+)', disposition)
        # Keep only the final path component so a server-supplied name cannot
        # place the archive outside dest_dir.
        name: str = Path(match.group(1).strip()).name if match else ""
        archive: Path = dest_dir / (name or "download.tar.xz")

        dest_dir.mkdir(parents=True, exist_ok=True)
        with archive.open("wb") as f:
            for chunk in response.iter_bytes():
                f.write(chunk)

    # Argument lists are executed directly (no shell): with shell=True only the
    # first list element would be run, and "&&" is shell syntax, not an argument.
    extract = subprocess.run(
        ["tar", "-xvf", str(archive), "-C", str(dest_dir)], capture_output=True, text=True, check=False
    )
    if extract.returncode != 0:
        return {"status": "error", "stdout": extract.stdout, "stderr": extract.stderr}

    # Mirrors the intended "tar ... && ls ...": list only after a successful extract.
    listing = subprocess.run(["ls", "-lh", str(dest_dir)], capture_output=True, text=True, check=False)
    return {
        "status": "ok",
        "stdout": extract.stdout + listing.stdout,
        "stderr": extract.stderr + listing.stderr,
    }
114
+
115
+
116
@CLI.command
def search_gene_curies(term: str, ncbi_taxon: int = 9606) -> Union[list[Any], dict[str, Any]]:
    """Search gene CURIE candidates by term within an NCBI taxon.

    Args:
        term: Search term sent as the ``term`` query parameter.
        ncbi_taxon: NCBI taxon ID sent as the ``ncbi-taxon-id`` query parameter.

    Returns:
        The value returned by ``get_json_response`` for the request.
    """
    endpoint: str = "https://hypatia.systemsbiology.net/configurator-api/search-for-gene-curies-in-ncbi-taxon"
    query: dict[str, Any] = {"username": TABLASSIST_USERNAME, "api-key": TABLASSIST_API_KEY}
    query["term"] = term
    query["ncbi-taxon-id"] = ncbi_taxon
    return get_json_response(endpoint, query)
128
+
129
+
130
@CLI.command
def resolve_taxon_id(organism_name: str) -> Union[list[Any], dict[str, Any]]:
    """Resolve an NCBI Taxon ID from an organism name.

    Args:
        organism_name: Name sent as the ``organism-name`` query parameter.

    Returns:
        The value returned by ``get_json_response`` for the request.
    """
    endpoint: str = "https://hypatia.systemsbiology.net/configurator-api/get-ncbi-taxon-id-from-organism-name"
    query: dict[str, Any] = {"username": TABLASSIST_USERNAME, "api-key": TABLASSIST_API_KEY}
    query["organism-name"] = organism_name
    return get_json_response(endpoint, query)
141
+
142
+
143
@CLI.command
def list_categories() -> list[str]:
    """List all supported Biolink categories.

    Returns:
        The ``value`` of every member of the ``Categories`` enum, in
        iteration order.
    """
    values: list[str] = []
    for member in Categories:
        values.append(member.value)
    return values
147
+
148
+
149
@CLI.command
def list_predicates() -> list[str]:
    """List all supported Biolink predicates.

    Returns:
        The ``value`` of every member of the ``Predicates`` enum, in
        iteration order.
    """
    values: list[str] = []
    for member in Predicates:
        values.append(member.value)
    return values
153
+
154
+
155
@CLI.command
def list_qualifiers() -> list[str]:
    """List all supported Biolink qualifiers.

    Returns:
        The ``value`` of every member of the ``Qualifiers`` enum, in
        iteration order.
    """
    values: list[str] = []
    for member in Qualifiers:
        values.append(member.value)
    return values
159
+
160
+
161
@CLI.command
def section_schema() -> dict[str, Any]:
    """Return the Section Pydantic model as JSON schema.

    Returns:
        The result of ``Section.model_json_schema()``.
    """
    schema: dict[str, Any] = Section.model_json_schema()
    return schema
165
+
166
+
167
@CLI.command
def validate_section_str(yaml_string: str) -> dict[str, Any]:
    """Validate a single YAML table configuration section from a string.

    Args:
        yaml_string: YAML text handed to ``parse_yaml_string``.

    Returns:
        The parsed value itself when it is a dict containing an ``"error"``
        key; otherwise the result of ``validate_section`` on the parsed value.
    """
    parsed: Any = parse_yaml_string(yaml_string)
    parse_failed: bool = isinstance(parsed, dict) and "error" in parsed
    return parsed if parse_failed else validate_section(parsed)
175
+
176
+
177
@CLI.command
def validate_config_str(yaml_string: str) -> Union[dict[str, Any], list[dict[str, Any]]]:
    """Validate a full YAML table configuration from a string.

    Args:
        yaml_string: YAML text handed to ``parse_yaml_string``.

    Returns:
        The parsed value itself when it is a dict containing an ``"error"``
        key; otherwise a list with one ``validate_section`` result per
        section, in order.
    """
    parsed: Any = parse_yaml_string(yaml_string)
    if isinstance(parsed, dict) and "error" in parsed:
        return parsed

    # A document that is not a list is treated as a single section.
    if not isinstance(parsed, list):
        parsed = [parsed]
    return [validate_section(section) for section in parsed]
191
+
192
+
193
@CLI.command
def validate_config_file(yaml_file: Path) -> Union[dict[str, Any], list[dict[str, Any]]]:
    """Validate a full YAML table configuration from a file path.

    Args:
        yaml_file: Path of the YAML file handed to ``from_yaml``.

    Returns:
        A single ``{"error": ...}`` dict when loading raises a YAML error;
        otherwise a list with one ``validate_section`` result per section,
        in order.
    """
    try:
        raw: Any = from_yaml(yaml_file)
    except yaml.scanner.ScannerError as e:  # pyright: ignore
        # problem_mark is optional on YAML errors; only report a line number
        # when one is available instead of failing inside the handler.
        mark = e.problem_mark
        location: str = f" at line {mark.line + 1}" if mark is not None else ""
        return {"error": f"YAML Syntax error{location}: {e.problem}"}
    except yaml.parser.ParserError as e:  # pyright: ignore
        return {"error": f"YAML Parser error: {e}"}
    except yaml.YAMLError as e:
        return {"error": f"YAML error: {e}"}

    # A document that is not a list is treated as a single section.
    sections: list[dict[str, Any]] = raw if isinstance(raw, list) else [raw]
    return [validate_section(section) for section in sections]
212
+
213
+
214
@CLI.command
def docs_category(category: str) -> str:
    """Fetch Biolink documentation for a specific category.

    Args:
        category: Name passed to ``get_biolink_html_documentation``.

    Returns:
        The helper's result when it is truthy, otherwise an ``ERROR | ...``
        message naming the category.
    """
    documentation = get_biolink_html_documentation(category)
    if documentation:
        return documentation
    return f"ERROR | {category} is not a supported biolink category"
218
+
219
+
220
@CLI.command
def docs_predicate(predicate: str) -> str:
    """Fetch Biolink documentation for a specific predicate.

    Args:
        predicate: Name passed to ``get_biolink_html_documentation``.

    Returns:
        The helper's result when it is truthy, otherwise an ``ERROR | ...``
        message naming the predicate.
    """
    documentation = get_biolink_html_documentation(predicate)
    if documentation:
        return documentation
    return f"ERROR | {predicate} is not a supported biolink predicate"
224
+
225
+
226
@CLI.command
def docs_qualifier(qualifier: str) -> str:
    """Fetch Biolink documentation for a specific qualifier.

    Args:
        qualifier: Name passed to ``get_biolink_html_documentation``.

    Returns:
        The helper's result when it is truthy, otherwise an ``ERROR | ...``
        message naming the qualifier.
    """
    documentation = get_biolink_html_documentation(qualifier)
    if documentation:
        return documentation
    return f"ERROR | {qualifier} is not a supported biolink qualifier"
230
+
231
+
232
@CLI.command
def extract_text(file: Path, extension: Optional[str] = None) -> str:
    """Extract text from a file using textract (PDF, DOCX, etc.).

    Args:
        file: Path of the document to read.
        extension: Optional file-type override forwarded to textract as its
            ``extension`` argument; when omitted, the file's own suffix
            decides whether the PDF path is taken.

    Returns:
        The extracted text as a string.
    """
    options: dict[str, Any] = {}
    if extension:
        options["extension"] = extension

    # Path.suffix keeps its leading dot (".pdf"), so strip it and ignore case
    # before comparing; otherwise the pdfminer branch can never be selected.
    kind: str = (extension or file.suffix).lstrip(".").lower()
    if kind == "pdf":
        options["method"] = "pdfminer"

    raw: Any = textract.process(str(file), **options)
    # NOTE(review): textract appears to return encoded bytes (UTF-8 by
    # default); decode so the result matches the declared ``str`` return type.
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="replace")
    return raw
241
+
242
+
243
@CLI.command
def excel_sheets(file: Path) -> list[str]:
    """List sheet names in an Excel spreadsheet.

    Args:
        file: Path of the workbook opened with ``fastexcel.read_excel``.

    Returns:
        The ``sheet_names`` attribute of the opened workbook.
    """
    return fastexcel.read_excel(file).sheet_names
248
+
249
+
250
@CLI.command
def preview_excel(
    file: Path, sheet_name: str, n_rows: int, engine: Literal["calamine", "openpyxl", "xlsx2csv"] = "calamine"
) -> dict[str, Any]:
    """Preview the first N rows of an Excel sheet as a dict.

    Args:
        file: Path of the workbook to read.
        sheet_name: Name of the sheet to load.
        n_rows: Number of leading rows to keep.
        engine: Reader engine passed through to ``pl.read_excel``.

    Returns:
        The first ``n_rows`` rows as produced by
        ``DataFrame.to_dict(as_series=False)``.
    """
    frame: pl.DataFrame = pl.read_excel(
        source=file,
        sheet_name=sheet_name,
        engine=engine,
        infer_schema_length=None,
    )
    return frame.head(n_rows).to_dict(as_series=False)
258
+
259
+
260
@CLI.command
def preview_csv(file: Path, n_rows: int, separator: str = ",") -> dict[str, Any]:
    """Preview the first N rows of a CSV/tabular file as a dict.

    Args:
        file: Path of the delimited text file to read.
        n_rows: Row limit passed to ``pl.read_csv``.
        separator: Field separator passed to ``pl.read_csv``.

    Returns:
        The loaded rows as produced by ``DataFrame.to_dict(as_series=False)``.
    """
    return pl.read_csv(source=file, n_rows=n_rows, separator=separator).to_dict(as_series=False)
265
+
266
+
267
def serve() -> None:
    """Run the module-level cyclopts ``CLI`` app.

    This is the target of the ``tablassist`` console script declared in
    pyproject.toml (``tablassist.cli:serve``).
    """
    CLI()