parselabs 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parselabs-0.1.2/.claude/skills/code-clarity/SKILL.md +184 -0
- parselabs-0.1.2/.claude-sandbox.Dockerfile +4 -0
- parselabs-0.1.2/.env.example +26 -0
- parselabs-0.1.2/.github/dependabot.yml +8 -0
- parselabs-0.1.2/.github/workflows/release.yml +38 -0
- parselabs-0.1.2/.gitignore +135 -0
- parselabs-0.1.2/.pre-commit-config.yaml +11 -0
- parselabs-0.1.2/AGENTS.md +318 -0
- parselabs-0.1.2/CLAUDE.md +1 -0
- parselabs-0.1.2/LICENSE +21 -0
- parselabs-0.1.2/Makefile +5 -0
- parselabs-0.1.2/PKG-INFO +243 -0
- parselabs-0.1.2/README.md +206 -0
- parselabs-0.1.2/REQUIREMENTS.md +85 -0
- parselabs-0.1.2/config/lab_specs.json +4729 -0
- parselabs-0.1.2/docs/pipeline.md +309 -0
- parselabs-0.1.2/logo.png +0 -0
- parselabs-0.1.2/main.py +1615 -0
- parselabs-0.1.2/parselabs/__init__.py +68 -0
- parselabs-0.1.2/parselabs/cli.py +18 -0
- parselabs-0.1.2/parselabs/config.py +412 -0
- parselabs-0.1.2/parselabs/exceptions.py +13 -0
- parselabs-0.1.2/parselabs/extraction.py +816 -0
- parselabs-0.1.2/parselabs/normalization.py +1307 -0
- parselabs-0.1.2/parselabs/standardization.py +143 -0
- parselabs-0.1.2/parselabs/utils.py +184 -0
- parselabs-0.1.2/parselabs/validation.py +683 -0
- parselabs-0.1.2/pipeline.md +228 -0
- parselabs-0.1.2/profiles/template.yaml.example +19 -0
- parselabs-0.1.2/prompts/conversion_factor_system.md +1 -0
- parselabs-0.1.2/prompts/conversion_factor_user.md +4 -0
- parselabs-0.1.2/prompts/extraction_system.md +160 -0
- parselabs-0.1.2/prompts/extraction_user.md +30 -0
- parselabs-0.1.2/prompts/health_range_system.md +1 -0
- parselabs-0.1.2/prompts/health_range_user.md +4 -0
- parselabs-0.1.2/prompts/name_standardization.md +54 -0
- parselabs-0.1.2/prompts/text_extraction_user.md +26 -0
- parselabs-0.1.2/prompts/unit_standardization.md +52 -0
- parselabs-0.1.2/pyproject.toml +64 -0
- parselabs-0.1.2/static/viewer.css +185 -0
- parselabs-0.1.2/static/viewer.js +30 -0
- parselabs-0.1.2/test.py +358 -0
- parselabs-0.1.2/utils/README.md +113 -0
- parselabs-0.1.2/utils/analyze_unknowns.py +359 -0
- parselabs-0.1.2/utils/cleanup_removed_fields.py +169 -0
- parselabs-0.1.2/utils/lab_specs_manager.py +412 -0
- parselabs-0.1.2/utils/migrate_output_dirs.py +176 -0
- parselabs-0.1.2/utils/migrate_raw_columns.py +191 -0
- parselabs-0.1.2/utils/update_standardization_caches.py +251 -0
- parselabs-0.1.2/utils/validate_lab_specs_schema.py +457 -0
- parselabs-0.1.2/uv.lock +2035 -0
- parselabs-0.1.2/viewer.py +1985 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: code-clarity
|
|
3
|
+
description: Code quality and clarity guidelines for Python - enforces flat orchestrator patterns, explicit error handling, and mandatory comments
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Code Clarity Skill
|
|
7
|
+
|
|
8
|
+
## Core Rules
|
|
9
|
+
|
|
10
|
+
### 1. Flat Orchestrator
|
|
11
|
+
|
|
12
|
+
Main functions own all flow: max 2 indent levels, early returns as guards, helpers extracted for deeper logic.
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
def process_single_pdf(pdf_path: Path) -> tuple[Path | None, list[dict]]:
|
|
16
|
+
"""Process a single PDF: extract, standardize, save."""
|
|
17
|
+
# Initialize directory structure
|
|
18
|
+
doc_out_dir, csv_path, failed_pages = _setup_paths(pdf_path)
|
|
19
|
+
|
|
20
|
+
# Guard: Skip if no pages found
|
|
21
|
+
pages = _convert_to_images(pdf_path)
|
|
22
|
+
if not pages:
|
|
23
|
+
return None, []
|
|
24
|
+
|
|
25
|
+
# Extract lab data via vision model
|
|
26
|
+
try:
|
|
27
|
+
data = _extract_via_vision(pdf_path, pages)
|
|
28
|
+
except ExtractionError as e:
|
|
29
|
+
logger.error(f"Extraction failed: {e}")
|
|
30
|
+
return None, failed_pages
|
|
31
|
+
|
|
32
|
+
# Guard: No results extracted
|
|
33
|
+
if not data:
|
|
34
|
+
return _handle_empty_results(csv_path), failed_pages
|
|
35
|
+
|
|
36
|
+
# Normalize and save
|
|
37
|
+
_apply_standardization(data)
|
|
38
|
+
_save_results(data, csv_path)
|
|
39
|
+
return csv_path, failed_pages
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**Rules embedded in this example:**
|
|
43
|
+
- **Max 2 indent levels** — `try` block is the deepest nesting
|
|
44
|
+
- **Early return over else** — guards exit early, remaining code stays flat
|
|
45
|
+
- **Extract helpers** — `_extract_via_vision`, `_apply_standardization` keep orchestrator readable
|
|
46
|
+
- **Verb-based helper names** — `_setup_paths`, `_convert_to_images`, `_extract_via_vision`
|
|
47
|
+
- **Read top-to-bottom** — all decision points visible in main flow
|
|
48
|
+
|
|
49
|
+
**When to extract a helper:**
|
|
50
|
+
- Indentation would reach level 3+
|
|
51
|
+
- Block exceeds 5-7 contiguous lines
|
|
52
|
+
- Logic is independently testable or reusable
|
|
53
|
+
|
|
54
|
+
### 2. Errors Propagate Up
|
|
55
|
+
|
|
56
|
+
Helpers raise exceptions. Orchestrators catch and decide.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
# BAD - Helper hides failure
|
|
60
|
+
def _fetch_data():
|
|
61
|
+
try:
|
|
62
|
+
return api.call()
|
|
63
|
+
except Exception:
|
|
64
|
+
return None # Caller can't distinguish a failure from an empty result
|
|
65
|
+
|
|
66
|
+
# GOOD - Helper raises, orchestrator decides
|
|
67
|
+
def _fetch_data():
|
|
68
|
+
return api.call() # Let it raise
|
|
69
|
+
|
|
70
|
+
def process():
|
|
71
|
+
try:
|
|
72
|
+
data = _fetch_data()
|
|
73
|
+
except APIError as e:
|
|
74
|
+
logger.error(f"Fetch failed: {e}")
|
|
75
|
+
return None
|
|
76
|
+
return transform(data)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 3. Comment Every Branch
|
|
80
|
+
|
|
81
|
+
Every guard clause, if/elif/else branch, and logical block gets a comment explaining intent.
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Build and validate configuration
|
|
85
|
+
config, errors = build_config(args)
|
|
86
|
+
|
|
87
|
+
# Guard: Bail if validation failed
|
|
88
|
+
if errors:
|
|
89
|
+
return None, errors
|
|
90
|
+
|
|
91
|
+
# Check error type for appropriate response
|
|
92
|
+
if "401" in error_msg:
|
|
93
|
+
# Authentication failure - credentials invalid
|
|
94
|
+
return False, "Auth failed"
|
|
95
|
+
elif "timeout" in error_msg.lower():
|
|
96
|
+
# Server didn't respond in time
|
|
97
|
+
return False, "Server timeout"
|
|
98
|
+
else:
|
|
99
|
+
# Unknown error - fail safe
|
|
100
|
+
return False, "Unknown error"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**Mandatory check** — verify a comment exists before every:
|
|
104
|
+
- `return` / `continue` / `break` guard
|
|
105
|
+
- `if` / `elif` / `else` branch
|
|
106
|
+
- Logical block (group of related statements)
|
|
107
|
+
|
|
108
|
+
### 4. Blank Lines Between Blocks
|
|
109
|
+
|
|
110
|
+
Separate each comment-headed block with a blank line. This includes a blank line after the docstring, before the first block.
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# WRONG - dense wall of code
|
|
114
|
+
def _classify_server_error(error_msg: str, timeout: int) -> tuple[bool, str]:
|
|
115
|
+
"""Classify a server connectivity error."""
|
|
116
|
+
# Authentication errors
|
|
117
|
+
if "401" in error_msg or "Unauthorized" in error_msg:
|
|
118
|
+
return False, f"Auth failed: {error_msg}"
|
|
119
|
+
# Timeout errors
|
|
120
|
+
if "timeout" in error_msg.lower():
|
|
121
|
+
return False, f"Timeout after {timeout}s"
|
|
122
|
+
# Connection failures
|
|
123
|
+
if "Connection" in error_msg or "refused" in error_msg.lower():
|
|
124
|
+
return False, f"Cannot connect: {error_msg}"
|
|
125
|
+
# Unknown errors
|
|
126
|
+
return False, f"Server check failed: {error_msg}"
|
|
127
|
+
|
|
128
|
+
# RIGHT - each block breathes
|
|
129
|
+
def _classify_server_error(error_msg: str, timeout: int) -> tuple[bool, str]:
|
|
130
|
+
"""Classify a server connectivity error."""
|
|
131
|
+
|
|
132
|
+
# Authentication errors
|
|
133
|
+
if "401" in error_msg or "Unauthorized" in error_msg:
|
|
134
|
+
return False, f"Auth failed: {error_msg}"
|
|
135
|
+
|
|
136
|
+
# Timeout errors
|
|
137
|
+
if "timeout" in error_msg.lower():
|
|
138
|
+
return False, f"Timeout after {timeout}s"
|
|
139
|
+
|
|
140
|
+
# Connection failures
|
|
141
|
+
if "Connection" in error_msg or "refused" in error_msg.lower():
|
|
142
|
+
return False, f"Cannot connect: {error_msg}"
|
|
143
|
+
|
|
144
|
+
# Unknown errors
|
|
145
|
+
return False, f"Server check failed: {error_msg}"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Rule:** Blank line after docstring and before each comment-headed block.
|
|
149
|
+
|
|
150
|
+
### 5. Early Exit Over Branching
|
|
151
|
+
|
|
152
|
+
When one branch of an `if/else` is short (log, return, continue), flip it into a guard clause and remove the `else`. This reduces cognitive load — the reader can forget the short case before reading the main logic.
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
# BAD - reader must hold both branches in mind
|
|
156
|
+
def _send_notifications(errors: list[dict], recipients: list[str]) -> None:
|
|
157
|
+
"""Send error notifications to recipients."""
|
|
158
|
+
|
|
159
|
+
# Check for errors and notify
|
|
160
|
+
if errors:
|
|
161
|
+
summary = f"{len(errors)} errors detected"
|
|
162
|
+
for recipient in recipients:
|
|
163
|
+
_send_email(recipient, summary, errors)
|
|
164
|
+
logger.info(f"Notified {len(recipients)} recipients")
|
|
165
|
+
else:
|
|
166
|
+
logger.info("No errors to report")
|
|
167
|
+
|
|
168
|
+
# GOOD - early exit eliminates the else branch
|
|
169
|
+
def _send_notifications(errors: list[dict], recipients: list[str]) -> None:
|
|
170
|
+
"""Send error notifications to recipients."""
|
|
171
|
+
|
|
172
|
+
# Nothing to report
|
|
173
|
+
if not errors:
|
|
174
|
+
logger.info("No errors to report")
|
|
175
|
+
return
|
|
176
|
+
|
|
177
|
+
# Notify each recipient
|
|
178
|
+
summary = f"{len(errors)} errors detected"
|
|
179
|
+
for recipient in recipients:
|
|
180
|
+
_send_email(recipient, summary, errors)
|
|
181
|
+
logger.info(f"Notified {len(recipients)} recipients")
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
**When to flip:** The short branch should be 1-3 lines. If both branches are equally complex, an `if/else` is fine.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# ===========================================
|
|
2
|
+
# Labs Parser Configuration
|
|
3
|
+
# ===========================================
|
|
4
|
+
|
|
5
|
+
# Required: OpenRouter API key (get one at https://openrouter.ai/keys)
|
|
6
|
+
OPENROUTER_API_KEY=<your API key>
|
|
7
|
+
|
|
8
|
+
# Required: Model configuration
|
|
9
|
+
# Vision model for extraction
|
|
10
|
+
# Options by provider (updated January 2026):
|
|
11
|
+
# Anthropic: anthropic/claude-opus-4.5, anthropic/claude-sonnet-4, anthropic/claude-haiku-4.5
|
|
12
|
+
# Google: google/gemini-3-flash-preview
|
|
13
|
+
# OpenAI: openai/gpt-5.2, openai/gpt-4.1, openai/gpt-4.1-mini
|
|
14
|
+
# Qwen: qwen/qwen3-max, qwen/qwen3-vl-32b-instruct
|
|
15
|
+
EXTRACT_MODEL_ID=google/gemini-3-flash-preview
|
|
16
|
+
|
|
17
|
+
# ===========================================
|
|
18
|
+
# Optional Settings
|
|
19
|
+
# ===========================================
|
|
20
|
+
|
|
21
|
+
# Parallel workers for PDF processing (default: CPU count)
|
|
22
|
+
# MAX_WORKERS=4
|
|
23
|
+
|
|
24
|
+
# OpenRouter API base URL (default: https://openrouter.ai/api/v1)
|
|
25
|
+
# Useful for proxies or alternative endpoints
|
|
26
|
+
# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
paths:
|
|
7
|
+
- "parselabs/__init__.py"
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
uses: tsilva/.github/.github/workflows/test.yml@main
|
|
12
|
+
|
|
13
|
+
pii-scan:
|
|
14
|
+
uses: tsilva/.github/.github/workflows/pii-scan.yml@main
|
|
15
|
+
|
|
16
|
+
build-and-publish:
|
|
17
|
+
needs: [test, pii-scan]
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
permissions:
|
|
20
|
+
id-token: write
|
|
21
|
+
environment:
|
|
22
|
+
name: pypi
|
|
23
|
+
url: https://pypi.org/p/parselabs
|
|
24
|
+
steps:
|
|
25
|
+
- uses: actions/checkout@v4
|
|
26
|
+
|
|
27
|
+
- uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: "3.12"
|
|
30
|
+
|
|
31
|
+
- name: Install build tools
|
|
32
|
+
run: pip install build
|
|
33
|
+
|
|
34
|
+
- name: Build package
|
|
35
|
+
run: python -m build
|
|
36
|
+
|
|
37
|
+
- name: Publish to PyPI
|
|
38
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# >>> MANAGED BY GITGUARD - DO NOT EDIT THIS SECTION <<<
|
|
2
|
+
.claude/*.local.json
|
|
3
|
+
.claude/*.local.json.bak
|
|
4
|
+
.claude-sandbox.json
|
|
5
|
+
.mcp.json
|
|
6
|
+
.env
|
|
7
|
+
.env.*
|
|
8
|
+
!.env.example
|
|
9
|
+
!.env.*.example
|
|
10
|
+
.env.*.local
|
|
11
|
+
.env.local
|
|
12
|
+
*.pem
|
|
13
|
+
*.key
|
|
14
|
+
*.p12
|
|
15
|
+
*.pfx
|
|
16
|
+
*.gpg
|
|
17
|
+
*.secret
|
|
18
|
+
*-credentials.json
|
|
19
|
+
service-account*.json
|
|
20
|
+
credentials.json
|
|
21
|
+
secrets.json
|
|
22
|
+
.secrets/
|
|
23
|
+
.aws/
|
|
24
|
+
.ssh/
|
|
25
|
+
config.local.*
|
|
26
|
+
.DS_Store
|
|
27
|
+
Thumbs.db
|
|
28
|
+
.idea/
|
|
29
|
+
.vscode/
|
|
30
|
+
*.swp
|
|
31
|
+
*.swo
|
|
32
|
+
*.code-workspace
|
|
33
|
+
__pycache__/
|
|
34
|
+
*.py[cod]
|
|
35
|
+
.venv/
|
|
36
|
+
venv/
|
|
37
|
+
env/
|
|
38
|
+
*.egg-info/
|
|
39
|
+
node_modules/
|
|
40
|
+
.npm/
|
|
41
|
+
npm-debug.log*
|
|
42
|
+
yarn-debug.log*
|
|
43
|
+
yarn-error.log*
|
|
44
|
+
dist/
|
|
45
|
+
build/
|
|
46
|
+
logs/
|
|
47
|
+
*.log
|
|
48
|
+
# >>> END MANAGED <<<
|
|
49
|
+
|
|
50
|
+
# Worktrees
|
|
51
|
+
.worktrees/
|
|
52
|
+
|
|
53
|
+
# Project-specific rules
|
|
54
|
+
!profiles/_*
|
|
55
|
+
*$py.class
|
|
56
|
+
*.cover
|
|
57
|
+
*.egg
|
|
58
|
+
*.manifest
|
|
59
|
+
*.mo
|
|
60
|
+
*.pot
|
|
61
|
+
*.py,cover
|
|
62
|
+
*.sage.py
|
|
63
|
+
*.so
|
|
64
|
+
*.spec
|
|
65
|
+
.cache
|
|
66
|
+
.coverage
|
|
67
|
+
.coverage.*
|
|
68
|
+
.dmypy.json
|
|
69
|
+
.eggs/
|
|
70
|
+
.hypothesis/
|
|
71
|
+
.installed.cfg
|
|
72
|
+
.ipynb_checkpoints
|
|
73
|
+
.mypy_cache/
|
|
74
|
+
.nox/
|
|
75
|
+
.pdm-build/
|
|
76
|
+
.pdm-python
|
|
77
|
+
.pdm.toml
|
|
78
|
+
.pybuilder/
|
|
79
|
+
.pyre/
|
|
80
|
+
.pytest_cache/
|
|
81
|
+
.Python
|
|
82
|
+
.pytype/
|
|
83
|
+
.ropeproject
|
|
84
|
+
.scrapy
|
|
85
|
+
.spyderproject
|
|
86
|
+
.spyproject
|
|
87
|
+
.tox/
|
|
88
|
+
.venv
|
|
89
|
+
.webassets-cache
|
|
90
|
+
/site
|
|
91
|
+
__pypackages__/
|
|
92
|
+
celerybeat-schedule
|
|
93
|
+
celerybeat.pid
|
|
94
|
+
config/cache/
|
|
95
|
+
cover/
|
|
96
|
+
coverage.xml
|
|
97
|
+
cython_debug/
|
|
98
|
+
db.sqlite3
|
|
99
|
+
db.sqlite3-journal
|
|
100
|
+
develop-eggs/
|
|
101
|
+
dmypy.json
|
|
102
|
+
docs/_build/
|
|
103
|
+
downloads/
|
|
104
|
+
eggs/
|
|
105
|
+
env.bak/
|
|
106
|
+
ENV/
|
|
107
|
+
htmlcov/
|
|
108
|
+
input/
|
|
109
|
+
instance/
|
|
110
|
+
ipython_config.py
|
|
111
|
+
lib/
|
|
112
|
+
lib64/
|
|
113
|
+
local_settings.py
|
|
114
|
+
MANIFEST
|
|
115
|
+
nosetests.xml
|
|
116
|
+
output.zip
|
|
117
|
+
output/
|
|
118
|
+
parts/
|
|
119
|
+
pip-delete-this-directory.txt
|
|
120
|
+
pip-log.txt
|
|
121
|
+
plots/
|
|
122
|
+
profile_default/
|
|
123
|
+
profiles/*.json
|
|
124
|
+
profiles/*.yaml
|
|
125
|
+
profiles/*.yml
|
|
126
|
+
sdist/
|
|
127
|
+
share/python-wheels/
|
|
128
|
+
target/
|
|
129
|
+
test/outputs/
|
|
130
|
+
tests/fixtures/*
|
|
131
|
+
user.json
|
|
132
|
+
user_stats.json
|
|
133
|
+
var/
|
|
134
|
+
venv.bak/
|
|
135
|
+
wheels/
|