parselabs 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. parselabs-0.1.2/.claude/skills/code-clarity/SKILL.md +184 -0
  2. parselabs-0.1.2/.claude-sandbox.Dockerfile +4 -0
  3. parselabs-0.1.2/.env.example +26 -0
  4. parselabs-0.1.2/.github/dependabot.yml +8 -0
  5. parselabs-0.1.2/.github/workflows/release.yml +38 -0
  6. parselabs-0.1.2/.gitignore +135 -0
  7. parselabs-0.1.2/.pre-commit-config.yaml +11 -0
  8. parselabs-0.1.2/AGENTS.md +318 -0
  9. parselabs-0.1.2/CLAUDE.md +1 -0
  10. parselabs-0.1.2/LICENSE +21 -0
  11. parselabs-0.1.2/Makefile +5 -0
  12. parselabs-0.1.2/PKG-INFO +243 -0
  13. parselabs-0.1.2/README.md +206 -0
  14. parselabs-0.1.2/REQUIREMENTS.md +85 -0
  15. parselabs-0.1.2/config/lab_specs.json +4729 -0
  16. parselabs-0.1.2/docs/pipeline.md +309 -0
  17. parselabs-0.1.2/logo.png +0 -0
  18. parselabs-0.1.2/main.py +1615 -0
  19. parselabs-0.1.2/parselabs/__init__.py +68 -0
  20. parselabs-0.1.2/parselabs/cli.py +18 -0
  21. parselabs-0.1.2/parselabs/config.py +412 -0
  22. parselabs-0.1.2/parselabs/exceptions.py +13 -0
  23. parselabs-0.1.2/parselabs/extraction.py +816 -0
  24. parselabs-0.1.2/parselabs/normalization.py +1307 -0
  25. parselabs-0.1.2/parselabs/standardization.py +143 -0
  26. parselabs-0.1.2/parselabs/utils.py +184 -0
  27. parselabs-0.1.2/parselabs/validation.py +683 -0
  28. parselabs-0.1.2/pipeline.md +228 -0
  29. parselabs-0.1.2/profiles/template.yaml.example +19 -0
  30. parselabs-0.1.2/prompts/conversion_factor_system.md +1 -0
  31. parselabs-0.1.2/prompts/conversion_factor_user.md +4 -0
  32. parselabs-0.1.2/prompts/extraction_system.md +160 -0
  33. parselabs-0.1.2/prompts/extraction_user.md +30 -0
  34. parselabs-0.1.2/prompts/health_range_system.md +1 -0
  35. parselabs-0.1.2/prompts/health_range_user.md +4 -0
  36. parselabs-0.1.2/prompts/name_standardization.md +54 -0
  37. parselabs-0.1.2/prompts/text_extraction_user.md +26 -0
  38. parselabs-0.1.2/prompts/unit_standardization.md +52 -0
  39. parselabs-0.1.2/pyproject.toml +64 -0
  40. parselabs-0.1.2/static/viewer.css +185 -0
  41. parselabs-0.1.2/static/viewer.js +30 -0
  42. parselabs-0.1.2/test.py +358 -0
  43. parselabs-0.1.2/utils/README.md +113 -0
  44. parselabs-0.1.2/utils/analyze_unknowns.py +359 -0
  45. parselabs-0.1.2/utils/cleanup_removed_fields.py +169 -0
  46. parselabs-0.1.2/utils/lab_specs_manager.py +412 -0
  47. parselabs-0.1.2/utils/migrate_output_dirs.py +176 -0
  48. parselabs-0.1.2/utils/migrate_raw_columns.py +191 -0
  49. parselabs-0.1.2/utils/update_standardization_caches.py +251 -0
  50. parselabs-0.1.2/utils/validate_lab_specs_schema.py +457 -0
  51. parselabs-0.1.2/uv.lock +2035 -0
  52. parselabs-0.1.2/viewer.py +1985 -0
@@ -0,0 +1,184 @@
1
+ ---
2
+ name: code-clarity
3
+ description: Code quality and clarity guidelines for Python - enforces flat orchestrator patterns, explicit error handling, and mandatory comments
4
+ ---
5
+
6
+ # Code Clarity Skill
7
+
8
+ ## Core Rules
9
+
10
+ ### 1. Flat Orchestrator
11
+
12
+ Main functions own all flow: max 2 indent levels, early returns as guards, helpers extracted for deeper logic.
13
+
14
+ ```python
15
+ def process_single_pdf(pdf_path: Path) -> tuple[Path | None, list[dict]]:
16
+ """Process a single PDF: extract, standardize, save."""
17
+ # Initialize directory structure
18
+ doc_out_dir, csv_path, failed_pages = _setup_paths(pdf_path)
19
+
20
+ # Guard: Skip if no pages found
21
+ pages = _convert_to_images(pdf_path)
22
+ if not pages:
23
+ return None, []
24
+
25
+ # Extract lab data via vision model
26
+ try:
27
+ data = _extract_via_vision(pdf_path, pages)
28
+ except ExtractionError as e:
29
+ logger.error(f"Extraction failed: {e}")
30
+ return None, failed_pages
31
+
32
+ # Guard: No results extracted
33
+ if not data:
34
+ return _handle_empty_results(csv_path), failed_pages
35
+
36
+ # Normalize and save
37
+ _apply_standardization(data)
38
+ _save_results(data, csv_path)
39
+ return csv_path, failed_pages
40
+ ```
41
+
42
+ **Rules embedded in this example:**
43
+ - **Max 2 indent levels** — `try` block is the deepest nesting
44
+ - **Early return over else** — guards exit early, remaining code stays flat
45
+ - **Extract helpers** — `_extract_via_vision`, `_apply_standardization` keep orchestrator readable
46
+ - **Verb-based helper names** — `_setup_paths`, `_convert_to_images`, `_extract_via_vision`
47
+ - **Read top-to-bottom** — all decision points visible in main flow
48
+
49
+ **When to extract a helper:**
50
+ - Indentation would reach level 3+
51
+ - Block grows beyond roughly 5-7 contiguous lines
52
+ - Logic is independently testable or reusable
53
+
54
+ ### 2. Errors Propagate Up
55
+
56
+ Helpers raise exceptions. Orchestrators catch and decide.
57
+
58
+ ```python
59
+ # BAD - Helper hides failure
60
+ def _fetch_data():
61
+ try:
62
+ return api.call()
63
+ except Exception:
64
+ return None # Caller can't distinguish failure from empty
65
+
66
+ # GOOD - Helper raises, orchestrator decides
67
+ def _fetch_data():
68
+ return api.call() # Let it raise
69
+
70
+ def process():
71
+ try:
72
+ data = _fetch_data()
73
+ except APIError as e:
74
+ logger.error(f"Fetch failed: {e}")
75
+ return None
76
+ return transform(data)
77
+ ```
78
+
79
+ ### 3. Comment Every Branch
80
+
81
+ Every guard clause, if/elif/else branch, and logical block gets a comment explaining intent.
82
+
83
+ ```python
84
+ # Build and validate configuration
85
+ config, errors = build_config(args)
86
+
87
+ # Guard: Bail if validation failed
88
+ if errors:
89
+ return None, errors
90
+
91
+ # Check error type for appropriate response
92
+ if "401" in error_msg:
93
+ # Authentication failure - credentials invalid
94
+ return False, "Auth failed"
95
+ elif "timeout" in error_msg.lower():
96
+ # Server didn't respond in time
97
+ return False, "Server timeout"
98
+ else:
99
+ # Unknown error - fail safe
100
+ return False, "Unknown error"
101
+ ```
102
+
103
+ **Mandatory check** — verify a comment exists for every:
104
+ - `return` / `continue` / `break` guard
105
+ - `if` / `elif` / `else` branch
106
+ - Logical block (group of related statements)
107
+
108
+ ### 4. Blank Lines Between Blocks
109
+
110
+ Separate each comment-headed block with a blank line. This includes after the docstring.
111
+
112
+ ```python
113
+ # WRONG - dense wall of code
114
+ def _classify_server_error(error_msg: str, timeout: int) -> tuple[bool, str]:
115
+ """Classify a server connectivity error."""
116
+ # Authentication errors
117
+ if "401" in error_msg or "Unauthorized" in error_msg:
118
+ return False, f"Auth failed: {error_msg}"
119
+ # Timeout errors
120
+ if "timeout" in error_msg.lower():
121
+ return False, f"Timeout after {timeout}s"
122
+ # Connection failures
123
+ if "Connection" in error_msg or "refused" in error_msg.lower():
124
+ return False, f"Cannot connect: {error_msg}"
125
+ # Unknown errors
126
+ return False, f"Server check failed: {error_msg}"
127
+
128
+ # RIGHT - each block breathes
129
+ def _classify_server_error(error_msg: str, timeout: int) -> tuple[bool, str]:
130
+ """Classify a server connectivity error."""
131
+
132
+ # Authentication errors
133
+ if "401" in error_msg or "Unauthorized" in error_msg:
134
+ return False, f"Auth failed: {error_msg}"
135
+
136
+ # Timeout errors
137
+ if "timeout" in error_msg.lower():
138
+ return False, f"Timeout after {timeout}s"
139
+
140
+ # Connection failures
141
+ if "Connection" in error_msg or "refused" in error_msg.lower():
142
+ return False, f"Cannot connect: {error_msg}"
143
+
144
+ # Unknown errors
145
+ return False, f"Server check failed: {error_msg}"
146
+ ```
147
+
148
+ **Rule:** Blank line after docstring and before each comment-headed block.
149
+
150
+ ### 5. Early Exit Over Branching
151
+
152
+ When one branch of an `if/else` is short (log, return, continue), flip it into a guard clause and remove the `else`. This reduces cognitive load — the reader can forget the short case before reading the main logic.
153
+
154
+ ```python
155
+ # BAD - reader must hold both branches in mind
156
+ def _send_notifications(errors: list[dict], recipients: list[str]) -> None:
157
+ """Send error notifications to recipients."""
158
+
159
+ # Check for errors and notify
160
+ if errors:
161
+ summary = f"{len(errors)} errors detected"
162
+ for recipient in recipients:
163
+ _send_email(recipient, summary, errors)
164
+ logger.info(f"Notified {len(recipients)} recipients")
165
+ else:
166
+ logger.info("No errors to report")
167
+
168
+ # GOOD - early exit eliminates the else branch
169
+ def _send_notifications(errors: list[dict], recipients: list[str]) -> None:
170
+ """Send error notifications to recipients."""
171
+
172
+ # Nothing to report
173
+ if not errors:
174
+ logger.info("No errors to report")
175
+ return
176
+
177
+ # Notify each recipient
178
+ summary = f"{len(errors)} errors detected"
179
+ for recipient in recipients:
180
+ _send_email(recipient, summary, errors)
181
+ logger.info(f"Notified {len(recipients)} recipients")
182
+ ```
183
+
184
+ **When to flip:** The short branch should be 1-3 lines. If both branches are equally complex, an `if/else` is fine.
@@ -0,0 +1,4 @@
1
+ FROM claude-sandbox:latest
2
+ USER root
3
+ RUN apt-get update && apt-get install -y poppler-utils && rm -rf /var/lib/apt/lists/*
4
+ USER claude
@@ -0,0 +1,26 @@
1
+ # ===========================================
2
+ # Labs Parser Configuration
3
+ # ===========================================
4
+
5
+ # Required: OpenRouter API key (get one at https://openrouter.ai/keys)
6
+ OPENROUTER_API_KEY=<your API key>
7
+
8
+ # Required: Model configuration
9
+ # Vision model for extraction
10
+ # Options by provider (updated January 2026):
11
+ # Anthropic: anthropic/claude-opus-4.5, anthropic/claude-sonnet-4, anthropic/claude-haiku-4.5
12
+ # Google: google/gemini-3-flash-preview
13
+ # OpenAI: openai/gpt-5.2, openai/gpt-4.1, openai/gpt-4.1-mini
14
+ # Qwen: qwen/qwen3-max, qwen/qwen3-vl-32b-instruct
15
+ EXTRACT_MODEL_ID=google/gemini-3-flash-preview
16
+
17
+ # ===========================================
18
+ # Optional Settings
19
+ # ===========================================
20
+
21
+ # Parallel workers for PDF processing (default: CPU count)
22
+ # MAX_WORKERS=4
23
+
24
+ # OpenRouter API base URL (default: https://openrouter.ai/api/v1)
25
+ # Useful for proxies or alternative endpoints
26
+ # OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
@@ -0,0 +1,8 @@
1
+ # Dependabot configuration for automated dependency updates
2
+ # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates
3
+ version: 2
4
+ updates:
5
+ - package-ecosystem: "pip"
6
+ directory: "/"
7
+ schedule:
8
+ interval: "weekly"
@@ -0,0 +1,38 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - "parselabs/__init__.py"
8
+
9
+ jobs:
10
+ test:
11
+ uses: tsilva/.github/.github/workflows/test.yml@main
12
+
13
+ pii-scan:
14
+ uses: tsilva/.github/.github/workflows/pii-scan.yml@main
15
+
16
+ build-and-publish:
17
+ needs: [test, pii-scan]
18
+ runs-on: ubuntu-latest
19
+ permissions:
20
+ id-token: write
21
+ environment:
22
+ name: pypi
23
+ url: https://pypi.org/p/parselabs
24
+ steps:
25
+ - uses: actions/checkout@v4
26
+
27
+ - uses: actions/setup-python@v5
28
+ with:
29
+ python-version: "3.12"
30
+
31
+ - name: Install build tools
32
+ run: pip install build
33
+
34
+ - name: Build package
35
+ run: python -m build
36
+
37
+ - name: Publish to PyPI
38
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,135 @@
1
+ # >>> MANAGED BY GITGUARD - DO NOT EDIT THIS SECTION <<<
2
+ .claude/*.local.json
3
+ .claude/*.local.json.bak
4
+ .claude-sandbox.json
5
+ .mcp.json
6
+ .env
7
+ .env.*
8
+ !.env.example
9
+ !.env.*.example
10
+ .env.*.local
11
+ .env.local
12
+ *.pem
13
+ *.key
14
+ *.p12
15
+ *.pfx
16
+ *.gpg
17
+ *.secret
18
+ *-credentials.json
19
+ service-account*.json
20
+ credentials.json
21
+ secrets.json
22
+ .secrets/
23
+ .aws/
24
+ .ssh/
25
+ config.local.*
26
+ .DS_Store
27
+ Thumbs.db
28
+ .idea/
29
+ .vscode/
30
+ *.swp
31
+ *.swo
32
+ *.code-workspace
33
+ __pycache__/
34
+ *.py[cod]
35
+ .venv/
36
+ venv/
37
+ env/
38
+ *.egg-info/
39
+ node_modules/
40
+ .npm/
41
+ npm-debug.log*
42
+ yarn-debug.log*
43
+ yarn-error.log*
44
+ dist/
45
+ build/
46
+ logs/
47
+ *.log
48
+ # >>> END MANAGED <<<
49
+
50
+ # Worktrees
51
+ .worktrees/
52
+
53
+ # Project-specific rules
54
+ !profiles/_*
55
+ *$py.class
56
+ *.cover
57
+ *.egg
58
+ *.manifest
59
+ *.mo
60
+ *.pot
61
+ *.py,cover
62
+ *.sage.py
63
+ *.so
64
+ *.spec
65
+ .cache
66
+ .coverage
67
+ .coverage.*
68
+ .dmypy.json
69
+ .eggs/
70
+ .hypothesis/
71
+ .installed.cfg
72
+ .ipynb_checkpoints
73
+ .mypy_cache/
74
+ .nox/
75
+ .pdm-build/
76
+ .pdm-python
77
+ .pdm.toml
78
+ .pybuilder/
79
+ .pyre/
80
+ .pytest_cache/
81
+ .Python
82
+ .pytype/
83
+ .ropeproject
84
+ .scrapy
85
+ .spyderproject
86
+ .spyproject
87
+ .tox/
88
+ .venv
89
+ .webassets-cache
90
+ /site
91
+ __pypackages__/
92
+ celerybeat-schedule
93
+ celerybeat.pid
94
+ config/cache/
95
+ cover/
96
+ coverage.xml
97
+ cython_debug/
98
+ db.sqlite3
99
+ db.sqlite3-journal
100
+ develop-eggs/
101
+ dmypy.json
102
+ docs/_build/
103
+ downloads/
104
+ eggs/
105
+ env.bak/
106
+ ENV/
107
+ htmlcov/
108
+ input/
109
+ instance/
110
+ ipython_config.py
111
+ lib/
112
+ lib64/
113
+ local_settings.py
114
+ MANIFEST
115
+ nosetests.xml
116
+ output.zip
117
+ output/
118
+ parts/
119
+ pip-delete-this-directory.txt
120
+ pip-log.txt
121
+ plots/
122
+ profile_default/
123
+ profiles/*.json
124
+ profiles/*.yaml
125
+ profiles/*.yml
126
+ sdist/
127
+ share/python-wheels/
128
+ target/
129
+ test/outputs/
130
+ tests/fixtures/*
131
+ user.json
132
+ user_stats.json
133
+ var/
134
+ venv.bak/
135
+ wheels/
@@ -0,0 +1,11 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.15.2
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+ - repo: https://github.com/tsilva/.github
9
+ rev: v0.1.0
10
+ hooks:
11
+ - id: gitleaks