hoard-erd 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. hoard_erd-0.2.1/LICENSE +21 -0
  2. hoard_erd-0.2.1/PKG-INFO +34 -0
  3. hoard_erd-0.2.1/README.md +137 -0
  4. hoard_erd-0.2.1/pyproject.toml +68 -0
  5. hoard_erd-0.2.1/setup.cfg +4 -0
  6. hoard_erd-0.2.1/src/hoard/__init__.py +13 -0
  7. hoard_erd-0.2.1/src/hoard/__main__.py +11 -0
  8. hoard_erd-0.2.1/src/hoard/ark/__init__.py +24 -0
  9. hoard_erd-0.2.1/src/hoard/ark/loader.py +331 -0
  10. hoard_erd-0.2.1/src/hoard/ark/mapping.py +232 -0
  11. hoard_erd-0.2.1/src/hoard/ark/semantic_mapper.py +319 -0
  12. hoard_erd-0.2.1/src/hoard/benchmark/__init__.py +18 -0
  13. hoard_erd-0.2.1/src/hoard/benchmark/ollama_stats.py +79 -0
  14. hoard_erd-0.2.1/src/hoard/benchmark/vram_profiler.py +144 -0
  15. hoard_erd-0.2.1/src/hoard/cli/__init__.py +8 -0
  16. hoard_erd-0.2.1/src/hoard/cli/keys.py +156 -0
  17. hoard_erd-0.2.1/src/hoard/cli/main.py +404 -0
  18. hoard_erd-0.2.1/src/hoard/cli/run.py +346 -0
  19. hoard_erd-0.2.1/src/hoard/config.py +132 -0
  20. hoard_erd-0.2.1/src/hoard/export/__init__.py +20 -0
  21. hoard_erd-0.2.1/src/hoard/export/docx_writer.py +312 -0
  22. hoard_erd-0.2.1/src/hoard/export/pdf_writer.py +287 -0
  23. hoard_erd-0.2.1/src/hoard/export/photo_plates.py +186 -0
  24. hoard_erd-0.2.1/src/hoard/export/signatures.py +127 -0
  25. hoard_erd-0.2.1/src/hoard/extractors/__init__.py +22 -0
  26. hoard_erd-0.2.1/src/hoard/extractors/nuextract3.py +244 -0
  27. hoard_erd-0.2.1/src/hoard/extractors/template.py +90 -0
  28. hoard_erd-0.2.1/src/hoard/helpers.py +60 -0
  29. hoard_erd-0.2.1/src/hoard/phases/__init__.py +13 -0
  30. hoard_erd-0.2.1/src/hoard/phases/phase0.py +464 -0
  31. hoard_erd-0.2.1/src/hoard/phases/phase1.py +661 -0
  32. hoard_erd-0.2.1/src/hoard/phases/phase2.py +688 -0
  33. hoard_erd-0.2.1/src/hoard/phases/phase3.py +906 -0
  34. hoard_erd-0.2.1/src/hoard/phases/phase4.py +396 -0
  35. hoard_erd-0.2.1/src/hoard/phases/phase5.py +610 -0
  36. hoard_erd-0.2.1/src/hoard/providers/__init__.py +65 -0
  37. hoard_erd-0.2.1/src/hoard/providers/anthropic.py +222 -0
  38. hoard_erd-0.2.1/src/hoard/providers/credentials.py +135 -0
  39. hoard_erd-0.2.1/src/hoard/providers/google.py +224 -0
  40. hoard_erd-0.2.1/src/hoard/providers/hardware.py +218 -0
  41. hoard_erd-0.2.1/src/hoard/providers/ollama.py +176 -0
  42. hoard_erd-0.2.1/src/hoard/providers/openai.py +201 -0
  43. hoard_erd-0.2.1/src/hoard/providers/protocol.py +215 -0
  44. hoard_erd-0.2.1/src/hoard/providers/router.py +477 -0
  45. hoard_erd-0.2.1/src/hoard/review/__init__.py +41 -0
  46. hoard_erd-0.2.1/src/hoard/review/dashboard.py +651 -0
  47. hoard_erd-0.2.1/src/hoard/review/harris.py +434 -0
  48. hoard_erd-0.2.1/src/hoard/templates/__init__.py +12 -0
  49. hoard_erd-0.2.1/src/hoard/templates/engine.py +459 -0
  50. hoard_erd-0.2.1/src/hoard/workspace.py +93 -0
  51. hoard_erd-0.2.1/src/hoard_erd.egg-info/PKG-INFO +34 -0
  52. hoard_erd-0.2.1/src/hoard_erd.egg-info/SOURCES.txt +66 -0
  53. hoard_erd-0.2.1/src/hoard_erd.egg-info/dependency_links.txt +1 -0
  54. hoard_erd-0.2.1/src/hoard_erd.egg-info/entry_points.txt +2 -0
  55. hoard_erd-0.2.1/src/hoard_erd.egg-info/requires.txt +30 -0
  56. hoard_erd-0.2.1/src/hoard_erd.egg-info/top_level.txt +1 -0
  57. hoard_erd-0.2.1/tests/test_ark.py +548 -0
  58. hoard_erd-0.2.1/tests/test_benchmark.py +184 -0
  59. hoard_erd-0.2.1/tests/test_harris.py +264 -0
  60. hoard_erd-0.2.1/tests/test_nuextract3.py +126 -0
  61. hoard_erd-0.2.1/tests/test_phase0.py +312 -0
  62. hoard_erd-0.2.1/tests/test_phase1.py +309 -0
  63. hoard_erd-0.2.1/tests/test_phase2.py +298 -0
  64. hoard_erd-0.2.1/tests/test_phase3.py +510 -0
  65. hoard_erd-0.2.1/tests/test_phase4.py +295 -0
  66. hoard_erd-0.2.1/tests/test_phase5.py +318 -0
  67. hoard_erd-0.2.1/tests/test_review_dashboard.py +305 -0
  68. hoard_erd-0.2.1/tests/test_template_engine.py +288 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Marcus Quinn
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: hoard-erd
3
+ Version: 0.2.1
4
+ Summary: Heritage Observation And Report Drafter — local multi-stage AI pipeline for archaeological grey literature reports
5
+ Author: Marcus Quinn
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ License-File: LICENSE
9
+ Requires-Dist: typer>=0.12
10
+ Requires-Dist: rich>=13
11
+ Requires-Dist: pillow>=10
12
+ Requires-Dist: opencv-python-headless>=4.9
13
+ Requires-Dist: wand>=0.6
14
+ Requires-Dist: pandas>=2
15
+ Requires-Dist: openpyxl>=3
16
+ Requires-Dist: pyyaml>=6
17
+ Requires-Dist: heritage-models>=1.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8; extra == "dev"
20
+ Requires-Dist: pytest-cov>=5; extra == "dev"
21
+ Requires-Dist: ruff>=0.4; extra == "dev"
22
+ Requires-Dist: mypy>=1; extra == "dev"
23
+ Provides-Extra: ark
24
+ Requires-Dist: sentence-transformers>=3; extra == "ark"
25
+ Requires-Dist: numpy>=1.24; extra == "ark"
26
+ Provides-Extra: ocr
27
+ Requires-Dist: transformers>=4.40; extra == "ocr"
28
+ Requires-Dist: torch>=2; extra == "ocr"
29
+ Requires-Dist: accelerate>=0.30; extra == "ocr"
30
+ Provides-Extra: llm
31
+ Requires-Dist: llama-cpp-python>=0.3; extra == "llm"
32
+ Provides-Extra: doc
33
+ Requires-Dist: python-docx>=1; extra == "doc"
34
+ Dynamic: license-file
@@ -0,0 +1,137 @@
1
+ # HOARD
2
+
3
+ **Heritage Observation And Report Drafter**
4
+
5
+ A fully local, multi-stage AI pipeline that converts archaeological field data — context sheets, finds catalogues, site photographs, section drawings, and sample results — into a near-publication-ready grey literature report conforming to the relevant heritage authority standard.
6
+
7
+ Targets 8 GB VRAM consumer GPUs. By default it runs entirely on-device via Ollama — zero API calls, and no data leaves your machine unless you opt in to a cloud provider.
8
+
9
+ ---
10
+
11
+ ## Features
12
+
13
+ ### End-to-End Report Generation
14
+ Converts raw field records (context sheets, finds catalogues, photographs, section drawings, sample results) into a complete grey literature report in six automated phases — from file triage through to publication-ready DOCX, PDF/A-2b, TEI-XML, and ZIP export. All six phases are implemented and E2E-verified on real archaeological data.
15
+
16
+ ### Multi-Provider AI
17
+ Switch between four AI backends per pipeline phase — **Ollama** (local GPU), **OpenAI**, **Anthropic Claude**, and **Google Gemini** — with intelligent routing based on task requirements, privacy constraints, and hardware availability. Configure once; HOARD selects the optimal provider automatically.
18
+
19
+ ### Hardware Tier System
20
+ Auto-detects your GPU, VRAM, and Ollama models on first run and suggests an appropriate tier:
21
+ - **Ultra-light** — no GPU needed, cloud-only inference
22
+ - **Budget** — 6 GB VRAM, compact local models
23
+ - **Standard** — 8-12 GB VRAM, full local pipeline
24
+ - **Performance** — 16-24 GB VRAM, high-end local models
25
+
26
+ ### 14 Jurisdiction Templates
27
+ Reports conform to heritage authority standards across 12 countries — England, Scotland, Wales, Ireland, Netherlands, France, Germany, US, Canada, Australia, New Zealand, and South Africa — plus a generic international fallback, all driven by declarative YAML templates (England has separate evaluation and excavation templates, giving 14 in total). Adding a new jurisdiction means writing one YAML file; no code changes required.
28
+
29
+ ### Interactive Review Dashboard
30
+ After each pipeline phase, a terminal TUI presents flagged items (blurred images, low-confidence OCR, spatial mismatches, compliance warnings) for Accept/Edit/Defer review. Corrections write back to the workspace and update pipeline state for re-runnable workflows.
31
+
32
+ ### Offline Getty Vocabulary
33
+ Standardises materials, periods, and artefact types against Getty AAT/ULAN/TGN terms using the `heritage-vocab` library — works offline with a built-in fallback covering common archaeological terms. No API calls required.
34
+
35
+ ### Harris Matrix Generator
36
+ Pure-Python SVG stratigraphic matrix from context relationships. Colour-coded by period, arrows from later to earlier contexts. No graphviz or external tools needed.
37
+
38
+ ### Cryptographically Signed PDFs
39
+ Optional PAdES-B/LTV digital signatures via pyHanko for legally compliant report certification.
40
+
41
+ ### Cloud-Ready Credential Vault
42
+ API keys for OpenAI, Anthropic, and Google are stored encrypted at rest (AES-256-GCM + PBKDF2) and managed via `hoard keys set/list/remove`. Cross-compatible with the Kryptis vault format.
43
+
44
+ ### Ecosystem Integration
45
+ HOARD shares data contracts and workflows with [StratiGraph](https://github.com/mabo-du/stratigraph) (Harris Matrix viewer), Trowel (desktop report drafter), Libby (radiocarbon calibration), Cache & Carry (offline collections management), and Dibble (3D lithic analysis) — all accessible through the unified `heritage` CLI.
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ # Install
51
+ pip install hoard-erd # from PyPI (distribution name is hoard-erd; the CLI command is `hoard`)
52
+ # or from source
53
+ git clone https://github.com/mabo-du/HOARD.git
54
+ cd HOARD && pip install -e ".[dev]"
55
+
56
+ # Install Ollama and pull models
57
+ ollama pull glm-ocr qwen3-vl:8b qwen3.5-4b gemma4
58
+
59
+ # Initialise a project
60
+ hoard init "Stoneyfield Farm 2026" --jurisdiction historic_england_cl3
61
+
62
+ # Run Phase 0 (no GPU needed)
63
+ hoard run --project stoneyfield_farm_2026 --input ./field_records --phase 0
64
+
65
+ # List available jurisdiction templates
66
+ hoard templates list
67
+ ```
68
+
69
+ ## CLI Reference
70
+
71
+ | Command | Description |
72
+ |---------|-------------|
73
+ | `hoard init <name>` | Initialise a new project |
74
+ | `hoard run --project <id>` | Run the pipeline (full or partial) |
75
+ | `hoard run --project <id> --phase <N>` | Run a single phase |
76
+ | `hoard run --project <id> --from-phase <N>` | Run from phase N onward |
77
+ | `hoard run --project <id> --strict` | Halt Phase 1 on schema validation failure |
78
+ | `hoard run --project <id> --extractor nuextract3` | Use NuExtract3 for Phase 1 extraction (opt-in) |
79
+ | `hoard import-ark --project <id> --input <dir>` | Import structured data from ARK system exports |
80
+ | `hoard review --project <id>` | Interactive review dashboard for flagged items |
81
+ | `hoard export --project <id> --format docx,pdf` | Export final report |
82
+ | `hoard templates list` | List available jurisdiction templates |
83
+ | `hoard templates show --name <code>` | Show template details with syntax highlighting |
84
+ | `hoard templates validate --file <path>` | Validate a template YAML file |
85
+ | `hoard keys set <provider> <key>` | Store an encrypted API key for cloud providers |
86
+ | `hoard keys list` | List configured API keys |
87
+ | `hoard keys unlock` | Unlock the credential vault |
88
+
89
+ ## Ecosystem Integration
90
+
91
+ HOARD is one component of a broader heritage science open-source ecosystem:
92
+
93
+ | Tool | Function | Integration |
94
+ |------|----------|-------------|
95
+ | **StratiGraph** | Interactive Harris Matrix editor (Tauri 2 + React) | [Shared JSON Schema](schemas/heritage-data-package-v1.json) — HOARD Phase 1 exports import directly |
96
+ | **Trowel** | Desktop report drafter (PyQt6) | Bidirectional JSON import/export, shared jurisdiction templates |
97
+ | **Libby** | Radiocarbon calibration (FastAPI + Svelte 5) | StratiGraph exports OxCal CQL / JSON payloads to Libby |
98
+ | **Cache & Carry** | Offline collections management (Tauri + Rust) | Getty AAT/ULAN/TGN vocabulary for term normalisation |
99
+ | **Dibble** | 3D lithic analysis (Python + PyVista) | Specialist finds appendix data via JSON bridge |
100
+ | **heritage-cli** | Unified ecosystem CLI | `heritage run/calibrate/lithics/review/matrix/publish` |
101
+
102
+ ## Jurisdiction Templates
103
+
104
+ Reports conform to national heritage authority standards via declarative YAML templates. Currently 14 jurisdictions:
105
+
106
+ | Code | Authority | Region |
107
+ |------|-----------|--------|
108
+ | `historic_england_cl3` | Historic England — Evaluation (CL3) | England |
109
+ | `historic_england_cl4` | Historic England — Excavation (CL4) | England |
110
+ | `historic_environment_scotland` | HES — Data Structure Report | Scotland |
111
+ | `wales_rcahmw` | Cadw / RCAHMW | Wales |
112
+ | `ireland_nms` | National Monuments Service | Ireland |
113
+ | `netherlands_kna` | KNA 5.0 | Netherlands |
114
+ | `france_inrap` | INRAP / Code du Patrimoine | France |
115
+ | `germany_denkmalpflege` | Landesdenkmalpflege | Germany |
116
+ | `us_section106` | Section 106 (NRHP) | United States |
117
+ | `canada_ontario` | Ontario S&G | Canada |
118
+ | `australia_burra` | Burra Charter / ICOMOS | Australia |
119
+ | `new_zealand` | Heritage NZ Pouhere Taonga | New Zealand |
120
+ | `south_africa_sahra` | SAHRA | South Africa |
121
+ | `international_generic` | Generic fallback | Any |
122
+
123
+ Adding a new jurisdiction means writing a single YAML file — no pipeline code changes required. Templates support `extends` inheritance for regional variations (e.g. US state-level overrides).
124
+
125
+ ## Documentation
126
+
127
+ - **[Full User Guide](docs/user-guide.md)** — installation, phase walkthroughs, ARK import, review dashboard, GPU setup, troubleshooting
128
+ - **`hoard --help`** — inline CLI reference
129
+ - **Research papers** — see `docs/research-papers/` for architectural deep-dives on multi-provider AI, ecosystem integration, schema unification, and model selection
130
+
131
+ ## Licence
132
+
133
+ MIT
134
+
135
+ ## Contributing
136
+
137
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and pull request workflow.
@@ -0,0 +1,68 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "hoard-erd"
7
+ version = "0.2.1"
8
+ description = "Heritage Observation And Report Drafter — local multi-stage AI pipeline for archaeological grey literature reports"
9
+ requires-python = ">=3.11"
10
+ license = { text = "MIT" }
11
+ authors = [
12
+ { name = "Marcus Quinn" },
13
+ ]
14
+
15
+ dependencies = [
16
+ # CLI
17
+ "typer>=0.12",
18
+ "rich>=13",
19
+
20
+ # Image processing
21
+ "pillow>=10",
22
+ "opencv-python-headless>=4.9",
23
+ "wand>=0.6",
24
+
25
+ # Data
26
+ "pandas>=2",
27
+ "openpyxl>=3",
28
+ "pyyaml>=6",
29
+
30
+ # Shared data models (auto-generated from heritage-types TypeSpec)
31
+ "heritage-models>=1.0",
32
+
33
+ # OCR (loaded on demand)
34
+ # "transformers>=4.40", # TrOCR
35
+ # "torch>=2", # PyTorch
36
+
37
+ # Reporting
38
+ # pandoc is a system-level dependency, not pip
39
+ ]
40
+
41
+ [project.optional-dependencies]
42
+ dev = [
43
+ "pytest>=8",
44
+ "pytest-cov>=5",
45
+ "ruff>=0.4",
46
+ "mypy>=1",
47
+ ]
48
+ ark = [
49
+ "sentence-transformers>=3",
50
+ "numpy>=1.24",
51
+ ]
52
+ ocr = [
53
+ "transformers>=4.40",
54
+ "torch>=2",
55
+ "accelerate>=0.30",
56
+ ]
57
+ llm = [
58
+ "llama-cpp-python>=0.3",
59
+ ]
60
+ doc = [
61
+ "python-docx>=1",
62
+ ]
63
+
64
+ [project.scripts]
65
+ hoard = "hoard.cli.main:app"
66
+
67
+ [tool.setuptools.packages.find]
68
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,13 @@
1
"""hoard — Heritage Observation And Report Drafter.

Multi-stage AI pipeline converting archaeological field data into
near-publication-ready grey literature reports. Fully local, targets
6 GB VRAM, jurisdiction-templated.

exports: (package) — use hoard.cli.main:app as entry point
used_by: pyproject.toml → `hoard` CLI command
rules:   No model inference logic in this file; orchestration only.
agent:   deepseek-v4-flash | 2026-05-09 | s_20260509_001 | Initial scaffold
"""

# Keep in sync with `version` in pyproject.toml (the published metadata for
# this release is 0.2.1).
__version__ = "0.2.1"
@@ -0,0 +1,11 @@
1
"""__main__.py — Allow `python -m hoard` as alternative to `hoard` CLI.

usage:   python -m hoard --help
used_by: developer workflow
rules:   Must delegate to hoard.cli.main:entry_point
agent:   deepseek-v4-flash | 2026-05-09 | s_20260509_001 | Initial scaffold
"""

# NOTE(review): the console script in pyproject.toml targets
# `hoard.cli.main:app`; confirm `entry_point` is also defined there.
from hoard.cli.main import entry_point

# Guarded so that merely importing `hoard.__main__` (test collectors, doc
# tools, package walkers) does not launch the CLI. `python -m hoard` still
# runs this module with __name__ == "__main__".
if __name__ == "__main__":
    entry_point()
@@ -0,0 +1,24 @@
1
"""hoard.ark — ARK system direct data input (bypasses Phase 1 OCR).

For digital-first excavations using the ARK (Archaeological Recording Kit)
system, structured data can be imported directly — bypassing Phase 0 file
ingestion and Phase 1 multi-modal digitisation.

exports: ArkImportResult, ArkSemanticMapper, guess_mapping_from_header,
         import_ark_export, map_headers_semantic, transform_row
used_by: hoard.cli.main → `hoard import-ark` command
rules:   Must never import torch or any GPU-bound library. Exported data
         must be compatible with Phase 5+ pipeline stages.
"""

from hoard.ark.loader import ArkImportResult, import_ark_export
from hoard.ark.mapping import guess_mapping_from_header, transform_row
from hoard.ark.semantic_mapper import ArkSemanticMapper, map_headers_semantic

# Public API of the package; keep the `exports:` line in the docstring in sync.
__all__ = [
    "ArkImportResult",
    "ArkSemanticMapper",
    "guess_mapping_from_header",
    "import_ark_export",
    "map_headers_semantic",
    "transform_row",
]
@@ -0,0 +1,331 @@
1
+ """loader.py — ARK export data importer.
2
+
3
+ Discovers ARK system export files (CSV/JSON), maps fields to HOARD's
4
+ internal representation, and writes structured data directly into the
5
+ workspace — bypassing Phase 0 file ingestion and Phase 1 OCR for
6
+ digital-first excavations.
7
+
8
+ exports: import_ark_export, ArkImportResult
9
+ used_by: hoard.cli.main → `hoard import-ark` command
10
+ rules: Must never import torch or any GPU-bound library.
11
+ Generated manifests must be compatible with Phase 5+ pipeline stages.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import csv
17
+ import json
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from hoard.ark.mapping import (
23
+ ARK_CONTEXT_FIELDS,
24
+ ARK_DRAWINGS_FIELDS,
25
+ ARK_FINDS_FIELDS,
26
+ ARK_PHOTOS_FIELDS,
27
+ ARK_SAMPLES_FIELDS,
28
+ SOURCE_TYPE_MAP,
29
+ guess_mapping_from_header,
30
+ transform_row,
31
+ )
32
+ from hoard.config import Config
33
+ from hoard.workspace import PipelineState
34
+
35
+ # ── Data contracts ─────────────────────────────────────────────────────────
36
+
37
# Registry of importable ARK tables, keyed by source type. Each entry holds:
#   filename_patterns — lower-case file stems recognised for that table
#   mapping           — the ARK→HOARD field mapping table for that table
#   output_key        — key name associated with the table's records
# Built from compact (name, patterns, mapping, output_key) rows so the four
# columns stay aligned across tables.
IMPORT_TABLES: dict[str, dict[str, Any]] = {
    table_name: {
        "filename_patterns": patterns,
        "mapping": mapping,
        "output_key": output_key,
    }
    for table_name, patterns, mapping, output_key in (
        (
            "context",
            ("context", "contexts", "ctx_register"),
            ARK_CONTEXT_FIELDS,
            "context_sheets",
        ),
        (
            "finds",
            ("finds", "small_finds", "finds_catalogue", "sf_register"),
            ARK_FINDS_FIELDS,
            "finds",
        ),
        (
            "samples",
            ("samples", "sample", "environmental"),
            ARK_SAMPLES_FIELDS,
            "samples",
        ),
        (
            "photos",
            ("photos", "photo", "images", "photo_log"),
            ARK_PHOTOS_FIELDS,
            "photos",
        ),
        (
            "drawings",
            ("drawings", "drawing", "plans", "section_drawings"),
            ARK_DRAWINGS_FIELDS,
            "drawings",
        ),
    )
}
64
+
65
+
66
class ArkImportResult:
    """Mutable summary of a single ARK import run.

    Instances start empty and are filled in by the importer: file and record
    counters, per-type record counts, accumulated error/warning messages, the
    path of the written manifest (if any) and the imported records themselves.
    """

    def __init__(self) -> None:
        # Messages collected while importing.
        self.errors: list[str] = []
        self.warnings: list[str] = []
        # Counters.
        self.files_found: int = 0
        self.files_parsed: int = 0
        self.total_records: int = 0
        self.records_by_type: dict[str, int] = {}
        # Outputs.
        self.manifest_path: Path | None = None
        self.digitised_data: list[dict[str, Any]] = []

    def to_dict(self) -> dict[str, Any]:
        """Return a summary dict of the result.

        The imported records (``digitised_data``) are not included, and
        ``manifest_path`` is rendered as a string, or ``None`` when unset.
        """
        plain_fields = (
            "files_found",
            "files_parsed",
            "total_records",
            "records_by_type",
            "errors",
            "warnings",
        )
        summary: dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        if self.manifest_path:
            summary["manifest_path"] = str(self.manifest_path)
        else:
            summary["manifest_path"] = None
        return summary
89
+
90
+
91
+ # ── File discovery ─────────────────────────────────────────────────────────
92
+
93
+
94
def _discover_ark_files(input_dir: Path) -> list[tuple[Path, str]]:
    """Find ARK export files directly inside ``input_dir``.

    A file is recognised when its lower-cased stem equals one of the
    ``filename_patterns`` of a table in ``IMPORT_TABLES`` and its extension is
    ``.csv`` or ``.json`` (case-insensitive). Hidden files and subdirectories
    are ignored; the directory is not searched recursively.

    Returns:
        ``(file_path, source_type)`` tuples sorted by path, or an empty list
        when ``input_dir`` is not a directory.
    """
    if not input_dir.is_dir():
        return []

    # Only the formats the importer can parse. Without this check a site
    # photo called "photos.jpg" or a scan called "plans.pdf" would be handed
    # to the CSV parser purely because of its stem.
    supported_suffixes = (".csv", ".json")

    # Flatten the registry into stem -> table name once. setdefault keeps the
    # first table that claims a stem, i.e. registry order wins on a clash.
    table_for_stem: dict[str, str] = {}
    for table_name, table_config in IMPORT_TABLES.items():
        for pattern in table_config["filename_patterns"]:
            table_for_stem.setdefault(pattern, table_name)

    found: list[tuple[Path, str]] = []
    for candidate in sorted(input_dir.iterdir()):
        if not candidate.is_file() or candidate.name.startswith("."):
            continue
        if candidate.suffix.lower() not in supported_suffixes:
            continue
        table_name = table_for_stem.get(candidate.stem.lower())
        if table_name is not None:
            found.append((candidate, table_name))
    return found
114
+
115
+
116
+ # ── CSV parsing ─────────────────────────────────────────────────────────────
117
+
118
+
119
+ def _parse_csv(filepath: Path) -> list[dict[str, Any]]:
120
+ """Parse a CSV file into a list of row dicts."""
121
+ rows: list[dict[str, Any]] = []
122
+ try:
123
+ with open(filepath, newline="", encoding="utf-8-sig") as f:
124
+ reader = csv.DictReader(f)
125
+ for row in reader:
126
+ cleaned = {k.strip(): v.strip() if v else "" for k, v in row.items()}
127
+ rows.append(cleaned)
128
+ except Exception as e:
129
+ raise ValueError(f"Failed to parse CSV {filepath}: {e}") from e
130
+ return rows
131
+
132
+
133
+ # ── JSON parsing ────────────────────────────────────────────────────────────
134
+
135
+
136
+ def _parse_json(filepath: Path) -> list[dict[str, Any]]:
137
+ """Parse a JSON file into a list of row dicts.
138
+
139
+ Handles both top-level arrays and {'data': [...]} wrappers.
140
+ """
141
+ raw = json.loads(filepath.read_text())
142
+ if isinstance(raw, list):
143
+ return raw
144
+ if isinstance(raw, dict):
145
+ # Common ARK JSON export wrappers
146
+ for key in ("data", "results", "records", "rows", "items"):
147
+ if key in raw and isinstance(raw[key], list):
148
+ return raw[key]
149
+ raise ValueError(f"Unrecognised JSON structure in {filepath}")
150
+
151
+
152
+ # ── Per-type processing ─────────────────────────────────────────────────────
153
+
154
+
155
def _process_table(
    filepath: Path,
    source_type: str,
    mapping_table: list[tuple[str, str, str | None]],
) -> tuple[list[dict[str, Any]], list[str]]:
    """Read one ARK export file and convert its rows.

    ``.json`` files go through the JSON parser; every other extension is read
    as CSV. The column-to-field mapping is inferred from the keys of the
    first row only.

    Returns:
        ``(transformed_records, warnings)``. An empty file, or one with no
        recognised fields, yields no records and a single warning.
    """
    # Pick the parser from the extension.
    parser = _parse_json if filepath.suffix.lower() == ".json" else _parse_csv
    raw_rows = parser(filepath)
    if not raw_rows:
        return [], [f"Empty file: {filepath.name}"]

    # The first row's keys stand in for the header.
    header = list(raw_rows[0].keys())
    field_map = guess_mapping_from_header(header, mapping_table)
    if not field_map:
        return [], [f"No recognised ARK fields in {filepath.name}"]

    notes: list[str] = []
    surplus = len(header) - len(field_map)
    if surplus > 0:
        notes.append(f"{filepath.name}: {surplus} unrecognised column(s) ignored")

    converted = [transform_row(row, field_map, source_type) for row in raw_rows]
    return converted, notes
190
+
191
+
192
+ # ── Data-source manifest generation ─────────────────────────────────────────
193
+
194
+
195
def _build_manifest(
    project_id: str,
    records_by_type: dict[str, int],
    errors: list[str],
    warnings: list[str],
    digitised_dir: Path,
) -> dict[str, Any]:
    """Assemble the manifest dict for ARK-imported data.

    The manifest mirrors the Phase 0 manifest layout and flags Phases 0 and 1
    as bypassed. The mandatory check is always recorded as ``"PASS"`` with no
    quality warnings. ``errors`` and ``warnings`` are stored by reference,
    not copied.
    """
    # One synthetic file entry per imported source type (Phase 5 needs these).
    # The path is nominal and always uses a .csv extension.
    file_entries = [
        {
            "id": f"ark_{source_type}",
            "path": f"ark_import/{source_type}.csv",
            "type": SOURCE_TYPE_MAP.get(source_type, "unknown"),
            "quality": {},
            "ark_record_count": count,
        }
        for source_type, count in records_by_type.items()
    ]

    created_at = datetime.now(timezone.utc).isoformat()
    record_total = sum(records_by_type.values())

    return {
        "project_id": project_id,
        "created": created_at,
        "import_method": "ark",
        "ark_direct_input": True,
        "phase0_bypassed": True,
        "phase1_bypassed": True,
        "total_records": record_total,
        "records_by_type": records_by_type,
        "files": file_entries,
        "mandatory_check": "PASS",
        "missing_mandatory": [],
        "quality_warnings": 0,
        "finds_validation_issues": [],
        "halt": False,
        "import_errors": errors,
        "import_warnings": warnings,
        "digitised_dir": str(digitised_dir),
    }
235
+
236
+
237
+ # ── Digitised data ──────────────────────────────────────────────────────────
238
+
239
+
240
+ def _write_digitised_data(
241
+ records: list[dict[str, Any]],
242
+ digitised_dir: Path,
243
+ prefix: str,
244
+ ) -> None:
245
+ """Write transformed records as digitised-phase-style JSON files.
246
+
247
+ Each record becomes its own JSON file (compatible with Phase 5's
248
+ expectation of per-record JSON in 01_digitised/).
249
+ """
250
+ digitised_dir.mkdir(parents=True, exist_ok=True)
251
+ for i, record in enumerate(records):
252
+ record_path = digitised_dir / f"{prefix}_{i:04d}.json"
253
+ record_path.write_text(json.dumps(record, indent=2))
254
+
255
+
256
+ # ── Main import ─────────────────────────────────────────────────────────────
257
+
258
+
259
def import_ark_export(config: Config) -> ArkImportResult:
    """Import ARK system export data for a project.

    Discovers ARK export files in ``config.input_dir``, maps their fields to
    HOARD's internal representation, writes one JSON file per record into
    ``config.digitised_dir``, writes ``manifest.json`` into
    ``config.manifest_dir`` and marks Phases 0 and 1 complete in the project's
    ``pipeline_state.json``.

    A file that cannot be read or parsed is recorded in ``result.errors`` and
    skipped; the remaining files are still imported. When no export files are
    found, an error is recorded and the function returns before any manifest
    or pipeline state is written.

    Returns:
        An ``ArkImportResult`` describing what was imported.
    """
    result = ArkImportResult()
    project_dir = config.project_dir
    manifest_dir = config.manifest_dir
    digitised_dir = config.digitised_dir

    project_dir.mkdir(parents=True, exist_ok=True)
    manifest_dir.mkdir(parents=True, exist_ok=True)

    # Discover ARK export files
    ark_files = _discover_ark_files(config.input_dir)
    result.files_found = len(ark_files)

    if not ark_files:
        result.errors.append("No ARK export files found")
        return result

    all_records: list[dict[str, Any]] = []
    # Several files can map to one source type (e.g. context.csv and
    # contexts.csv). Their records are pooled per type so that per-record
    # file names and counts do not overwrite each other.
    records_for_type: dict[str, list[dict[str, Any]]] = {}

    for filepath, source_type in ark_files:
        mapping_table = IMPORT_TABLES[source_type]["mapping"]
        try:
            records, warnings = _process_table(filepath, source_type, mapping_table)
        except ValueError as e:
            result.errors.append(str(e))
            continue
        except OSError as e:
            # An unreadable file must not abort the rest of the import.
            result.errors.append(f"Failed to read {filepath}: {e}")
            continue

        result.warnings.extend(warnings)
        result.files_parsed += 1

        if records:
            all_records.extend(records)
            records_for_type.setdefault(source_type, []).extend(records)

    # One write per source type keeps indices contiguous and unique.
    for source_type, type_records in records_for_type.items():
        _write_digitised_data(type_records, digitised_dir, source_type)

    records_by_type = {
        source_type: len(type_records)
        for source_type, type_records in records_for_type.items()
    }
    result.total_records = len(all_records)
    result.records_by_type = records_by_type
    result.digitised_data = all_records

    # Generate manifest
    manifest = _build_manifest(
        config.project_id,
        records_by_type,
        result.errors,
        result.warnings,
        digitised_dir,
    )

    manifest_path = manifest_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2))
    result.manifest_path = manifest_path

    # Update pipeline state: mark Phase 0 and Phase 1 as bypassed
    state = PipelineState(project_dir / "pipeline_state.json")
    state.complete_phase(
        0,
        summary=f"Bypassed via ARK import — {result.total_records} records from {result.files_parsed} files",
    )
    state.complete_phase(
        1,
        summary="Bypassed via ARK import — digital-first excavation data already structured",
    )

    return result