hoard-erd 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hoard_erd-0.2.1/LICENSE +21 -0
- hoard_erd-0.2.1/PKG-INFO +34 -0
- hoard_erd-0.2.1/README.md +137 -0
- hoard_erd-0.2.1/pyproject.toml +68 -0
- hoard_erd-0.2.1/setup.cfg +4 -0
- hoard_erd-0.2.1/src/hoard/__init__.py +13 -0
- hoard_erd-0.2.1/src/hoard/__main__.py +11 -0
- hoard_erd-0.2.1/src/hoard/ark/__init__.py +24 -0
- hoard_erd-0.2.1/src/hoard/ark/loader.py +331 -0
- hoard_erd-0.2.1/src/hoard/ark/mapping.py +232 -0
- hoard_erd-0.2.1/src/hoard/ark/semantic_mapper.py +319 -0
- hoard_erd-0.2.1/src/hoard/benchmark/__init__.py +18 -0
- hoard_erd-0.2.1/src/hoard/benchmark/ollama_stats.py +79 -0
- hoard_erd-0.2.1/src/hoard/benchmark/vram_profiler.py +144 -0
- hoard_erd-0.2.1/src/hoard/cli/__init__.py +8 -0
- hoard_erd-0.2.1/src/hoard/cli/keys.py +156 -0
- hoard_erd-0.2.1/src/hoard/cli/main.py +404 -0
- hoard_erd-0.2.1/src/hoard/cli/run.py +346 -0
- hoard_erd-0.2.1/src/hoard/config.py +132 -0
- hoard_erd-0.2.1/src/hoard/export/__init__.py +20 -0
- hoard_erd-0.2.1/src/hoard/export/docx_writer.py +312 -0
- hoard_erd-0.2.1/src/hoard/export/pdf_writer.py +287 -0
- hoard_erd-0.2.1/src/hoard/export/photo_plates.py +186 -0
- hoard_erd-0.2.1/src/hoard/export/signatures.py +127 -0
- hoard_erd-0.2.1/src/hoard/extractors/__init__.py +22 -0
- hoard_erd-0.2.1/src/hoard/extractors/nuextract3.py +244 -0
- hoard_erd-0.2.1/src/hoard/extractors/template.py +90 -0
- hoard_erd-0.2.1/src/hoard/helpers.py +60 -0
- hoard_erd-0.2.1/src/hoard/phases/__init__.py +13 -0
- hoard_erd-0.2.1/src/hoard/phases/phase0.py +464 -0
- hoard_erd-0.2.1/src/hoard/phases/phase1.py +661 -0
- hoard_erd-0.2.1/src/hoard/phases/phase2.py +688 -0
- hoard_erd-0.2.1/src/hoard/phases/phase3.py +906 -0
- hoard_erd-0.2.1/src/hoard/phases/phase4.py +396 -0
- hoard_erd-0.2.1/src/hoard/phases/phase5.py +610 -0
- hoard_erd-0.2.1/src/hoard/providers/__init__.py +65 -0
- hoard_erd-0.2.1/src/hoard/providers/anthropic.py +222 -0
- hoard_erd-0.2.1/src/hoard/providers/credentials.py +135 -0
- hoard_erd-0.2.1/src/hoard/providers/google.py +224 -0
- hoard_erd-0.2.1/src/hoard/providers/hardware.py +218 -0
- hoard_erd-0.2.1/src/hoard/providers/ollama.py +176 -0
- hoard_erd-0.2.1/src/hoard/providers/openai.py +201 -0
- hoard_erd-0.2.1/src/hoard/providers/protocol.py +215 -0
- hoard_erd-0.2.1/src/hoard/providers/router.py +477 -0
- hoard_erd-0.2.1/src/hoard/review/__init__.py +41 -0
- hoard_erd-0.2.1/src/hoard/review/dashboard.py +651 -0
- hoard_erd-0.2.1/src/hoard/review/harris.py +434 -0
- hoard_erd-0.2.1/src/hoard/templates/__init__.py +12 -0
- hoard_erd-0.2.1/src/hoard/templates/engine.py +459 -0
- hoard_erd-0.2.1/src/hoard/workspace.py +93 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/PKG-INFO +34 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/SOURCES.txt +66 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/dependency_links.txt +1 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/entry_points.txt +2 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/requires.txt +30 -0
- hoard_erd-0.2.1/src/hoard_erd.egg-info/top_level.txt +1 -0
- hoard_erd-0.2.1/tests/test_ark.py +548 -0
- hoard_erd-0.2.1/tests/test_benchmark.py +184 -0
- hoard_erd-0.2.1/tests/test_harris.py +264 -0
- hoard_erd-0.2.1/tests/test_nuextract3.py +126 -0
- hoard_erd-0.2.1/tests/test_phase0.py +312 -0
- hoard_erd-0.2.1/tests/test_phase1.py +309 -0
- hoard_erd-0.2.1/tests/test_phase2.py +298 -0
- hoard_erd-0.2.1/tests/test_phase3.py +510 -0
- hoard_erd-0.2.1/tests/test_phase4.py +295 -0
- hoard_erd-0.2.1/tests/test_phase5.py +318 -0
- hoard_erd-0.2.1/tests/test_review_dashboard.py +305 -0
- hoard_erd-0.2.1/tests/test_template_engine.py +288 -0
hoard_erd-0.2.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Marcus Quinn
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
hoard_erd-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hoard-erd
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Heritage Observation And Report Drafter — local multi-stage AI pipeline for archaeological grey literature reports
|
|
5
|
+
Author: Marcus Quinn
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: typer>=0.12
|
|
10
|
+
Requires-Dist: rich>=13
|
|
11
|
+
Requires-Dist: pillow>=10
|
|
12
|
+
Requires-Dist: opencv-python-headless>=4.9
|
|
13
|
+
Requires-Dist: wand>=0.6
|
|
14
|
+
Requires-Dist: pandas>=2
|
|
15
|
+
Requires-Dist: openpyxl>=3
|
|
16
|
+
Requires-Dist: pyyaml>=6
|
|
17
|
+
Requires-Dist: heritage-models>=1.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov>=5; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy>=1; extra == "dev"
|
|
23
|
+
Provides-Extra: ark
|
|
24
|
+
Requires-Dist: sentence-transformers>=3; extra == "ark"
|
|
25
|
+
Requires-Dist: numpy>=1.24; extra == "ark"
|
|
26
|
+
Provides-Extra: ocr
|
|
27
|
+
Requires-Dist: transformers>=4.40; extra == "ocr"
|
|
28
|
+
Requires-Dist: torch>=2; extra == "ocr"
|
|
29
|
+
Requires-Dist: accelerate>=0.30; extra == "ocr"
|
|
30
|
+
Provides-Extra: llm
|
|
31
|
+
Requires-Dist: llama-cpp-python>=0.3; extra == "llm"
|
|
32
|
+
Provides-Extra: doc
|
|
33
|
+
Requires-Dist: python-docx>=1; extra == "doc"
|
|
34
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# HOARD
|
|
2
|
+
|
|
3
|
+
**Heritage Observation And Report Drafter**
|
|
4
|
+
|
|
5
|
+
A fully local, multi-stage AI pipeline that converts archaeological field data — context sheets, finds catalogues, site photographs, section drawings, and sample results — into a near-publication-ready grey literature report conforming to the relevant heritage authority standard.
|
|
6
|
+
|
|
7
|
+
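Targets 8 GB VRAM consumer GPUs. When configured to use only the local Ollama backend, HOARD runs entirely on-device — zero API calls, zero data leaves your machine. Cloud providers (see Multi-Provider AI below) are optional.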
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
### End-to-End Report Generation
|
|
14
|
+
Converts raw field records (context sheets, finds catalogues, photographs, section drawings, sample results) into a complete grey literature report in six automated phases — from file triage through to publication-ready DOCX, PDF/A-2b, TEI-XML, and ZIP export. All six phases are implemented and E2E-verified on real archaeological data.
|
|
15
|
+
|
|
16
|
+
### Multi-Provider AI
|
|
17
|
+
Switch between four AI backends per pipeline phase — **Ollama** (local GPU), **OpenAI**, **Anthropic Claude**, and **Google Gemini** — with intelligent routing based on task requirements, privacy constraints, and hardware availability. Configure once; HOARD selects the optimal provider automatically.
|
|
18
|
+
|
|
19
|
+
### Hardware Tier System
|
|
20
|
+
Auto-detects your GPU, VRAM, and Ollama models on first run and suggests an appropriate tier:
|
|
21
|
+
- **Ultra-light** — no GPU needed, cloud-only inference
|
|
22
|
+
- **Budget** — 6 GB VRAM, compact local models
|
|
23
|
+
- **Standard** — 8-12 GB VRAM, full local pipeline
|
|
24
|
+
- **Performance** — 16-24 GB VRAM, high-end local models
|
|
25
|
+
|
|
26
|
+
### 14 Jurisdiction Templates
|
|
27
|
+
Reports conform to heritage authority standards in England (two templates: CL3 evaluation and CL4 excavation), Scotland, Wales, Ireland, the Netherlands, France, Germany, the US, Canada, Australia, New Zealand, and South Africa, plus a generic international fallback — 14 templates in total, all driven by declarative YAML. Adding a new jurisdiction means writing one YAML file; no code changes required.
|
|
28
|
+
|
|
29
|
+
### Interactive Review Dashboard
|
|
30
|
+
After each pipeline phase, a terminal TUI presents flagged items (blurred images, low-confidence OCR, spatial mismatches, compliance warnings) for Accept/Edit/Defer review. Corrections write back to the workspace and update pipeline state for re-runnable workflows.
|
|
31
|
+
|
|
32
|
+
### Offline Getty Vocabulary
|
|
33
|
+
Standardises materials, periods, and artefact types against Getty AAT/ULAN/TGN terms using the `heritage-vocab` library — works offline with a built-in fallback covering common archaeological terms. No API calls required.
|
|
34
|
+
|
|
35
|
+
### Harris Matrix Generator
|
|
36
|
+
Pure-Python SVG stratigraphic matrix from context relationships. Colour-coded by period, arrows from later to earlier contexts. No graphviz or external tools needed.
|
|
37
|
+
|
|
38
|
+
### Cryptographically Signed PDFs
|
|
39
|
+
Optional PAdES-B/LTV digital signatures via pyHanko for legally compliant report certification.
|
|
40
|
+
|
|
41
|
+
### Cloud-Ready Credential Vault
|
|
42
|
+
API keys for OpenAI, Anthropic, and Google are stored encrypted at rest (AES-256-GCM + PBKDF2) and managed via `hoard keys set/list/remove`. Cross-compatible with the Kryptis vault format.
|
|
43
|
+
|
|
44
|
+
### Ecosystem Integration
|
|
45
|
+
HOARD shares data contracts and workflows with [StratiGraph](https://github.com/mabo-du/stratigraph) (Harris Matrix viewer), Trowel (desktop report drafter), Libby (radiocarbon calibration), Cache & Carry (offline collections management), and Dibble (3D lithic analysis) — all accessible through the unified `heritage` CLI.
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# Install
|
|
51
|
+
pip install hoard-erd  # from PyPI
|
|
52
|
+
# or from source
|
|
53
|
+
git clone https://github.com/mabo-du/HOARD.git
|
|
54
|
+
cd HOARD && pip install -e ".[dev]"
|
|
55
|
+
|
|
56
|
+
# Install Ollama and pull models
|
|
57
|
+
ollama pull glm-ocr qwen3-vl:8b qwen3.5-4b gemma4
|
|
58
|
+
|
|
59
|
+
# Initialise a project
|
|
60
|
+
hoard init "Stoneyfield Farm 2026" --jurisdiction historic_england_cl3
|
|
61
|
+
|
|
62
|
+
# Run Phase 0 (no GPU needed)
|
|
63
|
+
hoard run --project stoneyfield_farm_2026 --input ./field_records --phase 0
|
|
64
|
+
|
|
65
|
+
# List available jurisdiction templates
|
|
66
|
+
hoard templates list
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## CLI Reference
|
|
70
|
+
|
|
71
|
+
| Command | Description |
|
|
72
|
+
|---------|-------------|
|
|
73
|
+
| `hoard init <name>` | Initialise a new project |
|
|
74
|
+
| `hoard run --project <id>` | Run the pipeline (full or partial) |
|
|
75
|
+
| `hoard run --project <id> --phase <N>` | Run a single phase |
|
|
76
|
+
| `hoard run --project <id> --from-phase <N>` | Run from phase N onward |
|
|
77
|
+
| `hoard run --project <id> --strict` | Halt Phase 1 on schema validation failure |
|
|
78
|
+
| `hoard run --project <id> --extractor nuextract3` | Use NuExtract3 for Phase 1 extraction (opt-in) |
|
|
79
|
+
| `hoard import-ark --project <id> --input <dir>` | Import structured data from ARK system exports |
|
|
80
|
+
| `hoard review --project <id>` | Interactive review dashboard for flagged items |
|
|
81
|
+
| `hoard export --project <id> --format docx,pdf` | Export final report |
|
|
82
|
+
| `hoard templates list` | List available jurisdiction templates |
|
|
83
|
+
| `hoard templates show --name <code>` | Show template details with syntax highlighting |
|
|
84
|
+
| `hoard templates validate --file <path>` | Validate a template YAML file |
|
|
85
|
+
| `hoard keys set <provider> <key>` | Store an encrypted API key for cloud providers |
|
|
86
|
+
| `hoard keys list` | List configured API keys |
|
|
87
|
+
| `hoard keys unlock` | Unlock the credential vault |
|
|
88
|
+
|
|
89
|
+
## Ecosystem Integration
|
|
90
|
+
|
|
91
|
+
HOARD is one component of a broader heritage science open-source ecosystem:
|
|
92
|
+
|
|
93
|
+
| Tool | Function | Integration |
|
|
94
|
+
|------|----------|-------------|
|
|
95
|
+
| **StratiGraph** | Interactive Harris Matrix editor (Tauri 2 + React) | [Shared JSON Schema](schemas/heritage-data-package-v1.json) — HOARD Phase 1 exports import directly |
|
|
96
|
+
| **Trowel** | Desktop report drafter (PyQt6) | Bidirectional JSON import/export, shared jurisdiction templates |
|
|
97
|
+
| **Libby** | Radiocarbon calibration (FastAPI + Svelte 5) | StratiGraph exports OxCal CQL / JSON payloads to Libby |
|
|
98
|
+
| **Cache & Carry** | Offline collections management (Tauri + Rust) | Getty AAT/ULAN/TGN vocabulary for term normalisation |
|
|
99
|
+
| **Dibble** | 3D lithic analysis (Python + PyVista) | Specialist finds appendix data via JSON bridge |
|
|
100
|
+
| **heritage-cli** | Unified ecosystem CLI | `heritage run/calibrate/lithics/review/matrix/publish` |
|
|
101
|
+
|
|
102
|
+
## Jurisdiction Templates
|
|
103
|
+
|
|
104
|
+
Reports conform to national heritage authority standards via declarative YAML templates. Currently 14 jurisdictions:
|
|
105
|
+
|
|
106
|
+
| Code | Authority | Region |
|
|
107
|
+
|------|-----------|--------|
|
|
108
|
+
| `historic_england_cl3` | Historic England — Evaluation (CL3) | England |
|
|
109
|
+
| `historic_england_cl4` | Historic England — Excavation (CL4) | England |
|
|
110
|
+
| `historic_environment_scotland` | HES — Data Structure Report | Scotland |
|
|
111
|
+
| `wales_rcahmw` | Cadw / RCAHMW | Wales |
|
|
112
|
+
| `ireland_nms` | National Monuments Service | Ireland |
|
|
113
|
+
| `netherlands_kna` | KNA 5.0 | Netherlands |
|
|
114
|
+
| `france_inrap` | INRAP / Code du Patrimoine | France |
|
|
115
|
+
| `germany_denkmalpflege` | Landesdenkmalpflege | Germany |
|
|
116
|
+
| `us_section106` | Section 106 (NRHP) | United States |
|
|
117
|
+
| `canada_ontario` | Ontario S&G | Canada |
|
|
118
|
+
| `australia_burra` | Burra Charter / ICOMOS | Australia |
|
|
119
|
+
| `new_zealand` | Heritage NZ Pouhere Taonga | New Zealand |
|
|
120
|
+
| `south_africa_sahra` | SAHRA | South Africa |
|
|
121
|
+
| `international_generic` | Generic fallback | Any |
|
|
122
|
+
|
|
123
|
+
Adding a new jurisdiction means writing a single YAML file — no pipeline code changes required. Templates support `extends` inheritance for regional variations (e.g. US state-level overrides).
|
|
124
|
+
|
|
125
|
+
## Documentation
|
|
126
|
+
|
|
127
|
+
- **[Full User Guide](docs/user-guide.md)** — installation, phase walkthroughs, ARK import, review dashboard, GPU setup, troubleshooting
|
|
128
|
+
- **`hoard --help`** — inline CLI reference
|
|
129
|
+
- **Research papers** — see `docs/research-papers/` for architectural deep-dives on multi-provider AI, ecosystem integration, schema unification, and model selection
|
|
130
|
+
|
|
131
|
+
## Licence
|
|
132
|
+
|
|
133
|
+
MIT
|
|
134
|
+
|
|
135
|
+
## Contributing
|
|
136
|
+
|
|
137
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and pull request workflow.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hoard-erd"
|
|
7
|
+
version = "0.2.1"
|
|
8
|
+
description = "Heritage Observation And Report Drafter — local multi-stage AI pipeline for archaeological grey literature reports"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Marcus Quinn" },
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
# CLI
|
|
17
|
+
"typer>=0.12",
|
|
18
|
+
"rich>=13",
|
|
19
|
+
|
|
20
|
+
# Image processing
|
|
21
|
+
"pillow>=10",
|
|
22
|
+
"opencv-python-headless>=4.9",
|
|
23
|
+
"wand>=0.6",
|
|
24
|
+
|
|
25
|
+
# Data
|
|
26
|
+
"pandas>=2",
|
|
27
|
+
"openpyxl>=3",
|
|
28
|
+
"pyyaml>=6",
|
|
29
|
+
|
|
30
|
+
# Shared data models (auto-generated from heritage-types TypeSpec)
|
|
31
|
+
"heritage-models>=1.0",
|
|
32
|
+
|
|
33
|
+
# OCR (loaded on demand)
|
|
34
|
+
# "transformers>=4.40", # TrOCR
|
|
35
|
+
# "torch>=2", # PyTorch
|
|
36
|
+
|
|
37
|
+
# Reporting
|
|
38
|
+
# pandoc is a system-level dependency, not pip
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.optional-dependencies]
|
|
42
|
+
dev = [
|
|
43
|
+
"pytest>=8",
|
|
44
|
+
"pytest-cov>=5",
|
|
45
|
+
"ruff>=0.4",
|
|
46
|
+
"mypy>=1",
|
|
47
|
+
]
|
|
48
|
+
ark = [
|
|
49
|
+
"sentence-transformers>=3",
|
|
50
|
+
"numpy>=1.24",
|
|
51
|
+
]
|
|
52
|
+
ocr = [
|
|
53
|
+
"transformers>=4.40",
|
|
54
|
+
"torch>=2",
|
|
55
|
+
"accelerate>=0.30",
|
|
56
|
+
]
|
|
57
|
+
llm = [
|
|
58
|
+
"llama-cpp-python>=0.3",
|
|
59
|
+
]
|
|
60
|
+
doc = [
|
|
61
|
+
"python-docx>=1",
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
[project.scripts]
|
|
65
|
+
hoard = "hoard.cli.main:app"
|
|
66
|
+
|
|
67
|
+
[tool.setuptools.packages.find]
|
|
68
|
+
where = ["src"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""hoard — Heritage Observation And Report Drafter.
|
|
2
|
+
|
|
3
|
+
Multi-stage AI pipeline converting archaeological field data into
|
|
4
|
+
near-publication-ready grey literature reports. Fully local, targets
|
|
5
|
+
6 GB VRAM, jurisdiction-templated.
|
|
6
|
+
|
|
7
|
+
exports: (package) — use hoard.cli.main:app as entry point
|
|
8
|
+
used_by: pyproject.toml → `hoard` CLI command
|
|
9
|
+
rules: No model inference logic in this file; orchestration only.
|
|
10
|
+
agent: deepseek-v4-flash | 2026-05-09 | s_20260509_001 | Initial scaffold
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
# Runtime version string. Must match `version` in pyproject.toml / PKG-INFO;
# this release is published as 0.2.1.
__version__ = "0.2.1"
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""__main__.py — Allow `python -m hoard` as alternative to `hoard` CLI.
|
|
2
|
+
|
|
3
|
+
usage: python -m hoard --help
|
|
4
|
+
used_by: developer workflow
|
|
5
|
+
rules: Must delegate to hoard.cli.main:entry_point
|
|
6
|
+
agent: deepseek-v4-flash | 2026-05-09 | s_20260509_001 | Initial scaffold
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from hoard.cli.main import entry_point
|
|
10
|
+
|
|
11
|
+
entry_point()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""hoard.ark — ARK system direct data input (bypasses Phase 1 OCR).
|
|
2
|
+
|
|
3
|
+
For digital-first excavations using the ARK (Archaeological Recording Kit)
|
|
4
|
+
system, structured data can be imported directly — bypassing Phase 0 file
|
|
5
|
+
ingestion and Phase 1 multi-modal digitisation.
|
|
6
|
+
|
|
7
|
+
exports: import_ark_export, ArkImportResult
|
|
8
|
+
used_by: hoard.cli.main → `hoard import-ark` command
|
|
9
|
+
rules: Must never import torch or any GPU-bound library. Exported data
|
|
10
|
+
must be compatible with Phase 5+ pipeline stages.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from hoard.ark.loader import ArkImportResult, import_ark_export
|
|
14
|
+
from hoard.ark.mapping import guess_mapping_from_header, transform_row
|
|
15
|
+
from hoard.ark.semantic_mapper import ArkSemanticMapper, map_headers_semantic
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ArkImportResult",
|
|
19
|
+
"ArkSemanticMapper",
|
|
20
|
+
"import_ark_export",
|
|
21
|
+
"guess_mapping_from_header",
|
|
22
|
+
"map_headers_semantic",
|
|
23
|
+
"transform_row",
|
|
24
|
+
]
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""loader.py — ARK export data importer.
|
|
2
|
+
|
|
3
|
+
Discovers ARK system export files (CSV/JSON), maps fields to HOARD's
|
|
4
|
+
internal representation, and writes structured data directly into the
|
|
5
|
+
workspace — bypassing Phase 0 file ingestion and Phase 1 OCR for
|
|
6
|
+
digital-first excavations.
|
|
7
|
+
|
|
8
|
+
exports: import_ark_export, ArkImportResult
|
|
9
|
+
used_by: hoard.cli.main → `hoard import-ark` command
|
|
10
|
+
rules: Must never import torch or any GPU-bound library.
|
|
11
|
+
Generated manifests must be compatible with Phase 5+ pipeline stages.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import csv
|
|
17
|
+
import json
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from hoard.ark.mapping import (
|
|
23
|
+
ARK_CONTEXT_FIELDS,
|
|
24
|
+
ARK_DRAWINGS_FIELDS,
|
|
25
|
+
ARK_FINDS_FIELDS,
|
|
26
|
+
ARK_PHOTOS_FIELDS,
|
|
27
|
+
ARK_SAMPLES_FIELDS,
|
|
28
|
+
SOURCE_TYPE_MAP,
|
|
29
|
+
guess_mapping_from_header,
|
|
30
|
+
transform_row,
|
|
31
|
+
)
|
|
32
|
+
from hoard.config import Config
|
|
33
|
+
from hoard.workspace import PipelineState
|
|
34
|
+
|
|
35
|
+
# ── Data contracts ─────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
# Registry of importable ARK tables, keyed by table name. Each entry holds:
#   filename_patterns — file stems (compared lower-cased) recognised as that table
#   mapping           — the field-mapping table imported from hoard.ark.mapping
#   output_key        — the output key name associated with the table
IMPORT_TABLES: dict[str, dict[str, Any]] = {
    table: {
        "filename_patterns": stems,
        "mapping": field_table,
        "output_key": output_key,
    }
    for table, stems, field_table, output_key in (
        (
            "context",
            ("context", "contexts", "ctx_register"),
            ARK_CONTEXT_FIELDS,
            "context_sheets",
        ),
        (
            "finds",
            ("finds", "small_finds", "finds_catalogue", "sf_register"),
            ARK_FINDS_FIELDS,
            "finds",
        ),
        (
            "samples",
            ("samples", "sample", "environmental"),
            ARK_SAMPLES_FIELDS,
            "samples",
        ),
        (
            "photos",
            ("photos", "photo", "images", "photo_log"),
            ARK_PHOTOS_FIELDS,
            "photos",
        ),
        (
            "drawings",
            ("drawings", "drawing", "plans", "section_drawings"),
            ARK_DRAWINGS_FIELDS,
            "drawings",
        ),
    )
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ArkImportResult:
    """Mutable summary of a single ARK import run.

    All counters start at zero and all collections start empty; the importer
    fills them in as it goes.
    """

    def __init__(self) -> None:
        # File and record counters.
        self.files_found: int = 0
        self.files_parsed: int = 0
        self.total_records: int = 0
        # Human-readable problems collected during the run.
        self.errors: list[str] = []
        self.warnings: list[str] = []
        # Record count per table name.
        self.records_by_type: dict[str, int] = {}
        # Location of the written manifest, once there is one.
        self.manifest_path: Path | None = None
        # The transformed records themselves (not included in to_dict()).
        self.digitised_data: list[dict[str, Any]] = []

    def to_dict(self) -> dict[str, Any]:
        """Return the summary fields as a plain dict.

        ``manifest_path`` is rendered as a string (or ``None`` when unset);
        ``digitised_data`` is deliberately left out.
        """
        summary: dict[str, Any] = dict(
            files_found=self.files_found,
            files_parsed=self.files_parsed,
            total_records=self.total_records,
            records_by_type=self.records_by_type,
            errors=self.errors,
            warnings=self.warnings,
        )
        if self.manifest_path:
            summary["manifest_path"] = str(self.manifest_path)
        else:
            summary["manifest_path"] = None
        return summary
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ── File discovery ─────────────────────────────────────────────────────────
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _discover_ark_files(input_dir: Path) -> list[tuple[Path, str]]:
|
|
95
|
+
"""Find ARK export files in input_dir.
|
|
96
|
+
|
|
97
|
+
Returns list of (file_path, source_type) tuples.
|
|
98
|
+
"""
|
|
99
|
+
found: list[tuple[Path, str]] = []
|
|
100
|
+
if not input_dir.is_dir():
|
|
101
|
+
return found
|
|
102
|
+
|
|
103
|
+
for f in sorted(input_dir.iterdir()):
|
|
104
|
+
if not f.is_file() or f.name.startswith("."):
|
|
105
|
+
continue
|
|
106
|
+
stem = f.stem.lower()
|
|
107
|
+
|
|
108
|
+
for table_name, table_config in IMPORT_TABLES.items():
|
|
109
|
+
if stem in table_config["filename_patterns"]:
|
|
110
|
+
found.append((f, table_name))
|
|
111
|
+
break
|
|
112
|
+
|
|
113
|
+
return found
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ── CSV parsing ─────────────────────────────────────────────────────────────
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _parse_csv(filepath: Path) -> list[dict[str, Any]]:
|
|
120
|
+
"""Parse a CSV file into a list of row dicts."""
|
|
121
|
+
rows: list[dict[str, Any]] = []
|
|
122
|
+
try:
|
|
123
|
+
with open(filepath, newline="", encoding="utf-8-sig") as f:
|
|
124
|
+
reader = csv.DictReader(f)
|
|
125
|
+
for row in reader:
|
|
126
|
+
cleaned = {k.strip(): v.strip() if v else "" for k, v in row.items()}
|
|
127
|
+
rows.append(cleaned)
|
|
128
|
+
except Exception as e:
|
|
129
|
+
raise ValueError(f"Failed to parse CSV {filepath}: {e}") from e
|
|
130
|
+
return rows
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ── JSON parsing ────────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _parse_json(filepath: Path) -> list[dict[str, Any]]:
|
|
137
|
+
"""Parse a JSON file into a list of row dicts.
|
|
138
|
+
|
|
139
|
+
Handles both top-level arrays and {'data': [...]} wrappers.
|
|
140
|
+
"""
|
|
141
|
+
raw = json.loads(filepath.read_text())
|
|
142
|
+
if isinstance(raw, list):
|
|
143
|
+
return raw
|
|
144
|
+
if isinstance(raw, dict):
|
|
145
|
+
# Common ARK JSON export wrappers
|
|
146
|
+
for key in ("data", "results", "records", "rows", "items"):
|
|
147
|
+
if key in raw and isinstance(raw[key], list):
|
|
148
|
+
return raw[key]
|
|
149
|
+
raise ValueError(f"Unrecognised JSON structure in {filepath}")
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ── Per-type processing ─────────────────────────────────────────────────────
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _process_table(
    filepath: Path,
    source_type: str,
    mapping_table: list[tuple[str, str, str | None]],
) -> tuple[list[dict[str, Any]], list[str]]:
    """Read one ARK export file and convert its rows.

    Files with a ``.json`` suffix (case-insensitive) go through the JSON
    parser; everything else is treated as CSV. The column-to-field mapping is
    guessed from the keys of the first row only.

    Returns ``(transformed_records, warnings)``. An empty file, or one with no
    recognised fields, yields no records and a single explanatory warning.
    """
    parse = _parse_json if filepath.suffix.lower() == ".json" else _parse_csv
    raw_rows = parse(filepath)
    if not raw_rows:
        return [], [f"Empty file: {filepath.name}"]

    # The first row's keys stand in for the header of the whole file.
    header = list(raw_rows[0].keys())
    field_map = guess_mapping_from_header(header, mapping_table)
    if not field_map:
        return [], [f"No recognised ARK fields in {filepath.name}"]

    notes: list[str] = []
    extra_columns = len(header) - len(field_map)
    if extra_columns > 0:
        notes.append(f"{filepath.name}: {extra_columns} unrecognised column(s) ignored")

    transformed: list[dict[str, Any]] = []
    for raw_row in raw_rows:
        transformed.append(transform_row(raw_row, field_map, source_type))

    return transformed, notes
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ── Data-source manifest generation ─────────────────────────────────────────
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _build_manifest(
|
|
196
|
+
project_id: str,
|
|
197
|
+
records_by_type: dict[str, int],
|
|
198
|
+
errors: list[str],
|
|
199
|
+
warnings: list[str],
|
|
200
|
+
digitised_dir: Path,
|
|
201
|
+
) -> dict[str, Any]:
|
|
202
|
+
"""Build a Phase-0-compatible manifest for ARK-imported data."""
|
|
203
|
+
manifest: dict[str, Any] = {
|
|
204
|
+
"project_id": project_id,
|
|
205
|
+
"created": datetime.now(timezone.utc).isoformat(),
|
|
206
|
+
"import_method": "ark",
|
|
207
|
+
"ark_direct_input": True,
|
|
208
|
+
"phase0_bypassed": True,
|
|
209
|
+
"phase1_bypassed": True,
|
|
210
|
+
"total_records": sum(records_by_type.values()),
|
|
211
|
+
"records_by_type": records_by_type,
|
|
212
|
+
"files": [],
|
|
213
|
+
"mandatory_check": "PASS",
|
|
214
|
+
"missing_mandatory": [],
|
|
215
|
+
"quality_warnings": 0,
|
|
216
|
+
"finds_validation_issues": [],
|
|
217
|
+
"halt": False,
|
|
218
|
+
"import_errors": errors,
|
|
219
|
+
"import_warnings": warnings,
|
|
220
|
+
"digitised_dir": str(digitised_dir),
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
# Build synthetic file entries for the manifest (Phase 5 needs these)
|
|
224
|
+
for source_type, count in records_by_type.items():
|
|
225
|
+
hoard_type = SOURCE_TYPE_MAP.get(source_type, "unknown")
|
|
226
|
+
manifest["files"].append({
|
|
227
|
+
"id": f"ark_{source_type}",
|
|
228
|
+
"path": f"ark_import/{source_type}.csv",
|
|
229
|
+
"type": hoard_type,
|
|
230
|
+
"quality": {},
|
|
231
|
+
"ark_record_count": count,
|
|
232
|
+
})
|
|
233
|
+
|
|
234
|
+
return manifest
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# ── Digitised data ──────────────────────────────────────────────────────────
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _write_digitised_data(
|
|
241
|
+
records: list[dict[str, Any]],
|
|
242
|
+
digitised_dir: Path,
|
|
243
|
+
prefix: str,
|
|
244
|
+
) -> None:
|
|
245
|
+
"""Write transformed records as digitised-phase-style JSON files.
|
|
246
|
+
|
|
247
|
+
Each record becomes its own JSON file (compatible with Phase 5's
|
|
248
|
+
expectation of per-record JSON in 01_digitised/).
|
|
249
|
+
"""
|
|
250
|
+
digitised_dir.mkdir(parents=True, exist_ok=True)
|
|
251
|
+
for i, record in enumerate(records):
|
|
252
|
+
record_path = digitised_dir / f"{prefix}_{i:04d}.json"
|
|
253
|
+
record_path.write_text(json.dumps(record, indent=2))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
# ── Main import ─────────────────────────────────────────────────────────────
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def import_ark_export(config: Config) -> ArkImportResult:
    """Import ARK system export data for a project.

    Discovers ARK export files in ``config.input_dir``, maps their fields to
    HOARD's internal representation, writes one JSON file per record into
    ``config.digitised_dir``, writes ``manifest.json`` into
    ``config.manifest_dir``, and marks Phase 0 and Phase 1 complete in the
    project's ``pipeline_state.json``.

    Several files may map to the same table (e.g. ``context.csv`` and
    ``contexts.csv``). Their records are merged before writing, so the
    per-record files get unique, contiguous indices and ``records_by_type``
    holds the combined count. Previously the later file overwrote the earlier
    file's output and count while ``total_records`` still summed both.

    A file that cannot be read or parsed is recorded in ``result.errors`` and
    skipped; it does not abort the import.

    Returns:
        An ArkImportResult describing what was imported. When no export files
        are found, it carries a single error and nothing is written apart from
        the project and manifest directories.
    """
    result = ArkImportResult()
    project_dir = config.project_dir
    manifest_dir = config.manifest_dir
    digitised_dir = config.digitised_dir

    project_dir.mkdir(parents=True, exist_ok=True)
    manifest_dir.mkdir(parents=True, exist_ok=True)

    # Discover ARK export files
    ark_files = _discover_ark_files(config.input_dir)
    result.files_found = len(ark_files)

    if not ark_files:
        result.errors.append("No ARK export files found")
        return result

    all_records: list[dict[str, Any]] = []  # discovery (file) order
    grouped: dict[str, list[dict[str, Any]]] = {}  # merged per table

    for filepath, source_type in ark_files:
        mapping_table = IMPORT_TABLES[source_type]["mapping"]
        try:
            records, warnings = _process_table(filepath, source_type, mapping_table)
        except (ValueError, OSError) as e:
            # OSError: an unreadable file should be reported like a parse
            # failure rather than abort the remaining files.
            result.errors.append(str(e))
            continue

        result.warnings.extend(warnings)
        result.files_parsed += 1

        if records:
            grouped.setdefault(source_type, []).extend(records)
            all_records.extend(records)
            result.total_records += len(records)

    # Write once per table so indices never collide between files of one type.
    for source_type, merged in grouped.items():
        _write_digitised_data(merged, digitised_dir, source_type)

    records_by_type: dict[str, int] = {
        source_type: len(merged) for source_type, merged in grouped.items()
    }
    result.records_by_type = records_by_type
    result.digitised_data = all_records

    # Generate manifest
    manifest = _build_manifest(
        config.project_id,
        records_by_type,
        result.errors,
        result.warnings,
        digitised_dir,
    )

    manifest_path = manifest_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    result.manifest_path = manifest_path

    # Update pipeline state: mark Phase 0 and Phase 1 as bypassed.
    # NOTE(review): this also runs when every discovered file failed to parse
    # (zero records imported) — confirm that is intended.
    state = PipelineState(project_dir / "pipeline_state.json")
    state.complete_phase(
        0,
        summary=f"Bypassed via ARK import — {result.total_records} records from {result.files_parsed} files",
    )
    state.complete_phase(
        1,
        summary="Bypassed via ARK import — digital-first excavation data already structured",
    )

    return result
|