llm-batch-annotate 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_batch_annotate-0.1.0/LICENSE +21 -0
- llm_batch_annotate-0.1.0/PKG-INFO +97 -0
- llm_batch_annotate-0.1.0/README.md +68 -0
- llm_batch_annotate-0.1.0/pyproject.toml +65 -0
- llm_batch_annotate-0.1.0/setup.cfg +4 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/__init__.py +193 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/_model.py +12 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/_version.py +3 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/artifacts/__init__.py +23 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/artifacts/local.py +86 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/artifacts/naming.py +103 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/builders/__init__.py +19 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/builders/assets.py +48 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/builders/base.py +154 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/builders/programmatic.py +41 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/builders/template.py +70 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/cli/__init__.py +31 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/cli/__main__.py +5 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/cli/main.py +483 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/configs/__init__.py +33 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/configs/models.py +158 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/contracts/__init__.py +39 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/contracts/base.py +263 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/contracts/records.py +167 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/enums.py +75 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/execution/__init__.py +22 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/execution/base.py +256 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/execution/providers/__init__.py +6 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/execution/providers/openai_batch.py +697 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/grouping/__init__.py +7 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/grouping/fixed_size.py +65 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/manifests/__init__.py +19 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/manifests/models.py +108 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/orchestration/__init__.py +10 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/orchestration/offline.py +139 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/orchestration/run.py +1042 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/parsers/__init__.py +7 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/parsers/base.py +318 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/parsers/structured.py +67 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/tasks/__init__.py +7 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/tasks/base.py +326 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/units/__init__.py +7 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/units/materialization.py +108 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/validation/__init__.py +8 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate/validation/coverage.py +115 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/PKG-INFO +97 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/SOURCES.txt +66 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/dependency_links.txt +1 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/entry_points.txt +2 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/requires.txt +9 -0
- llm_batch_annotate-0.1.0/src/llm_batch_annotate.egg-info/top_level.txt +1 -0
- llm_batch_annotate-0.1.0/tests/test_artifact_naming.py +48 -0
- llm_batch_annotate-0.1.0/tests/test_artifact_store.py +87 -0
- llm_batch_annotate-0.1.0/tests/test_builders.py +184 -0
- llm_batch_annotate-0.1.0/tests/test_cli.py +205 -0
- llm_batch_annotate-0.1.0/tests/test_configs.py +81 -0
- llm_batch_annotate-0.1.0/tests/test_contracts.py +254 -0
- llm_batch_annotate-0.1.0/tests/test_coverage_validation.py +60 -0
- llm_batch_annotate-0.1.0/tests/test_enums.py +18 -0
- llm_batch_annotate-0.1.0/tests/test_execution.py +197 -0
- llm_batch_annotate-0.1.0/tests/test_grouping.py +55 -0
- llm_batch_annotate-0.1.0/tests/test_imports.py +26 -0
- llm_batch_annotate-0.1.0/tests/test_manifests.py +104 -0
- llm_batch_annotate-0.1.0/tests/test_openai_batch_provider.py +396 -0
- llm_batch_annotate-0.1.0/tests/test_orchestration.py +314 -0
- llm_batch_annotate-0.1.0/tests/test_parsers.py +152 -0
- llm_batch_annotate-0.1.0/tests/test_tasks.py +142 -0
- llm_batch_annotate-0.1.0/tests/test_units.py +54 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Felipe Paula
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-batch-annotate
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Batch-oriented LLM annotation workflows for tabular datasets with OpenAI Batch support.
|
|
5
|
+
Author: Felipe Paula
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/felipepaula/batch_api_annotate
|
|
8
|
+
Project-URL: Documentation, https://llm-batch-annotate.readthedocs.io/
|
|
9
|
+
Project-URL: Repository, https://github.com/felipepaula/batch_api_annotate
|
|
10
|
+
Project-URL: Issues, https://github.com/felipepaula/batch_api_annotate/issues
|
|
11
|
+
Keywords: annotation,batch,llm,openai,pydantic,tabular-data
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pydantic<3,>=2
|
|
22
|
+
Provides-Extra: test
|
|
23
|
+
Requires-Dist: pytest<9,>=8; extra == "test"
|
|
24
|
+
Provides-Extra: docs
|
|
25
|
+
Requires-Dist: furo>=2024.8.6; extra == "docs"
|
|
26
|
+
Requires-Dist: myst-parser<5,>=4; extra == "docs"
|
|
27
|
+
Requires-Dist: sphinx<9,>=8; extra == "docs"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# `llm-batch-annotate`
|
|
31
|
+
|
|
32
|
+
`llm-batch-annotate` is a Python package for running reproducible LLM annotation workflows over tabular datasets. It materializes units from source rows, groups them into provider requests, submits them through an execution adapter, parses structured outputs, validates coverage, and writes run artifacts for auditability.
|
|
33
|
+
|
|
34
|
+
## Highlights
|
|
35
|
+
|
|
36
|
+
- single-unit and grouped annotation workflows
|
|
37
|
+
- provider-agnostic task, builder, parser, and artifact abstractions
|
|
38
|
+
- concrete OpenAI Batch adapter
|
|
39
|
+
- resumable CLI-driven runs with persisted manifests
|
|
40
|
+
- example configs, prompts, schemas, and sample data under `examples/`
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
When the package is published:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install llm-batch-annotate
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
From a local checkout:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
python3 -m venv .venv
|
|
54
|
+
.venv/bin/pip install -e ".[test,docs]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Quickstart
|
|
58
|
+
|
|
59
|
+
Single-unit example:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
export OPEN_AI_KEY="your-key"
|
|
63
|
+
llm-batch-annotate run examples/config/run_config.json --run-id example-single --no-poll-until-terminal
|
|
64
|
+
llm-batch-annotate resume examples/config/run_config.json example-single --poll-interval 2m
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Grouped example:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
export OPEN_AI_KEY="your-key"
|
|
71
|
+
llm-batch-annotate run examples/config/run_config_2.json --run-id example-grouped --no-poll-until-terminal
|
|
72
|
+
llm-batch-annotate resume examples/config/run_config_2.json example-grouped --poll-interval 2m
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Documentation
|
|
76
|
+
|
|
77
|
+
Project documentation is intended to be hosted on Read the Docs. The Sphinx source lives under `docs/`.
|
|
78
|
+
|
|
79
|
+
Planned public docs include:
|
|
80
|
+
|
|
81
|
+
- installation
|
|
82
|
+
- quickstart
|
|
83
|
+
- CLI reference
|
|
84
|
+
- config reference
|
|
85
|
+
- OpenAI Batch provider guide
|
|
86
|
+
- worked examples
|
|
87
|
+
- API reference
|
|
88
|
+
- development and release notes
|
|
89
|
+
|
|
90
|
+
## Repository layout
|
|
91
|
+
|
|
92
|
+
- `src/llm_batch_annotate/`: package source
|
|
93
|
+
- `examples/`: tracked example inputs and configs
|
|
94
|
+
- `tests/`: pytest suite
|
|
95
|
+
- `docs/`: Sphinx documentation source
|
|
96
|
+
|
|
97
|
+
Generated example runs are written to `examples/runs/` and are intentionally excluded from version control.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# `llm-batch-annotate`
|
|
2
|
+
|
|
3
|
+
`llm-batch-annotate` is a Python package for running reproducible LLM annotation workflows over tabular datasets. It materializes units from source rows, groups them into provider requests, submits them through an execution adapter, parses structured outputs, validates coverage, and writes run artifacts for auditability.
|
|
4
|
+
|
|
5
|
+
## Highlights
|
|
6
|
+
|
|
7
|
+
- single-unit and grouped annotation workflows
|
|
8
|
+
- provider-agnostic task, builder, parser, and artifact abstractions
|
|
9
|
+
- concrete OpenAI Batch adapter
|
|
10
|
+
- resumable CLI-driven runs with persisted manifests
|
|
11
|
+
- example configs, prompts, schemas, and sample data under `examples/`
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
When the package is published:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install llm-batch-annotate
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
From a local checkout:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
python3 -m venv .venv
|
|
25
|
+
.venv/bin/pip install -e ".[test,docs]"
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quickstart
|
|
29
|
+
|
|
30
|
+
Single-unit example:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
export OPEN_AI_KEY="your-key"
|
|
34
|
+
llm-batch-annotate run examples/config/run_config.json --run-id example-single --no-poll-until-terminal
|
|
35
|
+
llm-batch-annotate resume examples/config/run_config.json example-single --poll-interval 2m
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Grouped example:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
export OPEN_AI_KEY="your-key"
|
|
42
|
+
llm-batch-annotate run examples/config/run_config_2.json --run-id example-grouped --no-poll-until-terminal
|
|
43
|
+
llm-batch-annotate resume examples/config/run_config_2.json example-grouped --poll-interval 2m
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Documentation
|
|
47
|
+
|
|
48
|
+
Project documentation is intended to be hosted on Read the Docs. The Sphinx source lives under `docs/`.
|
|
49
|
+
|
|
50
|
+
Planned public docs include:
|
|
51
|
+
|
|
52
|
+
- installation
|
|
53
|
+
- quickstart
|
|
54
|
+
- CLI reference
|
|
55
|
+
- config reference
|
|
56
|
+
- OpenAI Batch provider guide
|
|
57
|
+
- worked examples
|
|
58
|
+
- API reference
|
|
59
|
+
- development and release notes
|
|
60
|
+
|
|
61
|
+
## Repository layout
|
|
62
|
+
|
|
63
|
+
- `src/llm_batch_annotate/`: package source
|
|
64
|
+
- `examples/`: tracked example inputs and configs
|
|
65
|
+
- `tests/`: pytest suite
|
|
66
|
+
- `docs/`: Sphinx documentation source
|
|
67
|
+
|
|
68
|
+
Generated example runs are written to `examples/runs/` and are intentionally excluded from version control.
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
# The [project] table uses PEP 639 metadata (an SPDX `license` string and
# `license-files`), which setuptools only accepts from v77 onwards.
requires = ["setuptools>=77", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-batch-annotate"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Batch-oriented LLM annotation workflows for tabular datasets with OpenAI Batch support."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [
|
|
14
|
+
{name = "Felipe Paula"},
|
|
15
|
+
]
|
|
16
|
+
keywords = [
|
|
17
|
+
"annotation",
|
|
18
|
+
"batch",
|
|
19
|
+
"llm",
|
|
20
|
+
"openai",
|
|
21
|
+
"pydantic",
|
|
22
|
+
"tabular-data",
|
|
23
|
+
]
|
|
24
|
+
classifiers = [
|
|
25
|
+
"Development Status :: 3 - Alpha",
|
|
26
|
+
"Intended Audience :: Developers",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
30
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"pydantic>=2,<3",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/felipepaula/batch_api_annotate"
|
|
38
|
+
Documentation = "https://llm-batch-annotate.readthedocs.io/"
|
|
39
|
+
Repository = "https://github.com/felipepaula/batch_api_annotate"
|
|
40
|
+
Issues = "https://github.com/felipepaula/batch_api_annotate/issues"
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
test = [
|
|
44
|
+
"pytest>=8,<9",
|
|
45
|
+
]
|
|
46
|
+
docs = [
|
|
47
|
+
"furo>=2024.8.6",
|
|
48
|
+
"myst-parser>=4,<5",
|
|
49
|
+
"sphinx>=8,<9",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.scripts]
|
|
53
|
+
llm-batch-annotate = "llm_batch_annotate.cli:main"
|
|
54
|
+
|
|
55
|
+
[tool.setuptools]
|
|
56
|
+
package-dir = {"" = "src"}
|
|
57
|
+
|
|
58
|
+
[tool.setuptools.dynamic]
|
|
59
|
+
version = {attr = "llm_batch_annotate._version.__version__"}
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
where = ["src"]
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Public package exports for llm_batch_annotate."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ._version import __version__
|
|
6
|
+
from .artifacts.naming import (
|
|
7
|
+
ARTIFACT_REGISTRY,
|
|
8
|
+
ArtifactDefinition,
|
|
9
|
+
artifact_path,
|
|
10
|
+
artifact_ref,
|
|
11
|
+
artifact_refs_for_run,
|
|
12
|
+
artifact_relative_path,
|
|
13
|
+
get_artifact_definition,
|
|
14
|
+
run_directory,
|
|
15
|
+
)
|
|
16
|
+
from .artifacts.local import LocalArtifactStore
|
|
17
|
+
from .builders import (
|
|
18
|
+
BaseBuilder,
|
|
19
|
+
ProgrammaticBuilderBase,
|
|
20
|
+
PromptAssetBundle,
|
|
21
|
+
SimpleTemplateBuilder,
|
|
22
|
+
load_prompt_asset_text,
|
|
23
|
+
load_prompt_assets,
|
|
24
|
+
resolve_prompt_asset_path,
|
|
25
|
+
)
|
|
26
|
+
from .configs.models import (
|
|
27
|
+
ArtifactStoreConfig,
|
|
28
|
+
ArtifactStoreSelectionConfig,
|
|
29
|
+
BaseProviderConfig,
|
|
30
|
+
GenericProviderConfig,
|
|
31
|
+
GroupingConfig,
|
|
32
|
+
OpenAIBatchConfig,
|
|
33
|
+
OutputConfig,
|
|
34
|
+
PromptAssetsConfig,
|
|
35
|
+
ProviderConfig,
|
|
36
|
+
ProviderSelectionConfig,
|
|
37
|
+
RetryPolicyConfig,
|
|
38
|
+
RunConfig,
|
|
39
|
+
RunMetadataConfig,
|
|
40
|
+
SourceInputConfig,
|
|
41
|
+
)
|
|
42
|
+
from .contracts.base import ArtifactStore, BaseMessageBuilder, BaseParser, BaseTask, ExecutionProvider
|
|
43
|
+
from .contracts.records import (
|
|
44
|
+
AnnotationRecord,
|
|
45
|
+
ArtifactRef,
|
|
46
|
+
ComponentRef,
|
|
47
|
+
ExecutionHandle,
|
|
48
|
+
FailureRecord,
|
|
49
|
+
GroupRecord,
|
|
50
|
+
GroupMembershipRecord,
|
|
51
|
+
ParsedRequestRecord,
|
|
52
|
+
ProviderCapabilities,
|
|
53
|
+
RawErrorRecord,
|
|
54
|
+
RawOutputRecord,
|
|
55
|
+
RawResultRecord,
|
|
56
|
+
RequestRecord,
|
|
57
|
+
UnitRecord,
|
|
58
|
+
)
|
|
59
|
+
from .execution import (
|
|
60
|
+
ExecutionProviderBase,
|
|
61
|
+
OpenAIBatchProvider,
|
|
62
|
+
OpenAIBatchProviderError,
|
|
63
|
+
SUCCESSFUL_EXECUTION_STATUSES,
|
|
64
|
+
TERMINAL_EXECUTION_STATUSES,
|
|
65
|
+
is_successful_execution_status,
|
|
66
|
+
is_terminal_execution_status,
|
|
67
|
+
normalize_execution_status,
|
|
68
|
+
)
|
|
69
|
+
from .enums import (
|
|
70
|
+
ArtifactFormat,
|
|
71
|
+
ArtifactKind,
|
|
72
|
+
ArtifactStoreKind,
|
|
73
|
+
ExecutionStatus,
|
|
74
|
+
FailureKind,
|
|
75
|
+
GroupingStrategy,
|
|
76
|
+
ProviderKind,
|
|
77
|
+
RunStatus,
|
|
78
|
+
SourceFormat,
|
|
79
|
+
TaskKind,
|
|
80
|
+
)
|
|
81
|
+
from .manifests.models import (
|
|
82
|
+
ComponentIdentitySummary,
|
|
83
|
+
GroupingSummary,
|
|
84
|
+
InputSummary,
|
|
85
|
+
LineageSummary,
|
|
86
|
+
ParseSummary,
|
|
87
|
+
RunManifest,
|
|
88
|
+
ValidationSummary,
|
|
89
|
+
)
|
|
90
|
+
from .orchestration import OfflineTaskPipeline, OfflineTaskPipelineResult, TaskOrchestrator, TaskRunState, default_run_id
|
|
91
|
+
from .parsers import BaseOutputParser, StructuredOutputParser
|
|
92
|
+
from .tasks import ComposedTaskBase, GroupedTaskBase, SingleTaskBase
|
|
93
|
+
from .grouping.fixed_size import build_group_memberships, membership_map, plan_fixed_size_groups
|
|
94
|
+
from .units.materialization import derive_unit_id, materialize_units, validate_unique_unit_ids
|
|
95
|
+
from .validation.coverage import CoverageValidationResult, coverage_failures, validate_coverage, validate_group_coverage
|
|
96
|
+
|
|
97
|
+
__all__ = [
|
|
98
|
+
"ARTIFACT_REGISTRY",
|
|
99
|
+
"AnnotationRecord",
|
|
100
|
+
"ArtifactDefinition",
|
|
101
|
+
"ArtifactFormat",
|
|
102
|
+
"ArtifactKind",
|
|
103
|
+
"ArtifactRef",
|
|
104
|
+
"ArtifactStore",
|
|
105
|
+
"LocalArtifactStore",
|
|
106
|
+
"ArtifactStoreConfig",
|
|
107
|
+
"ArtifactStoreKind",
|
|
108
|
+
"ArtifactStoreSelectionConfig",
|
|
109
|
+
"BaseBuilder",
|
|
110
|
+
"BaseMessageBuilder",
|
|
111
|
+
"BaseOutputParser",
|
|
112
|
+
"BaseParser",
|
|
113
|
+
"BaseProviderConfig",
|
|
114
|
+
"BaseTask",
|
|
115
|
+
"ComposedTaskBase",
|
|
116
|
+
"ComponentIdentitySummary",
|
|
117
|
+
"ComponentRef",
|
|
118
|
+
"ExecutionHandle",
|
|
119
|
+
"ExecutionProvider",
|
|
120
|
+
"ExecutionProviderBase",
|
|
121
|
+
"ExecutionStatus",
|
|
122
|
+
"FailureKind",
|
|
123
|
+
"FailureRecord",
|
|
124
|
+
"GenericProviderConfig",
|
|
125
|
+
"GroupRecord",
|
|
126
|
+
"GroupMembershipRecord",
|
|
127
|
+
"GroupingConfig",
|
|
128
|
+
"GroupingStrategy",
|
|
129
|
+
"GroupingSummary",
|
|
130
|
+
"GroupedTaskBase",
|
|
131
|
+
"InputSummary",
|
|
132
|
+
"LineageSummary",
|
|
133
|
+
"OfflineTaskPipeline",
|
|
134
|
+
"OfflineTaskPipelineResult",
|
|
135
|
+
"OpenAIBatchConfig",
|
|
136
|
+
"OpenAIBatchProvider",
|
|
137
|
+
"OpenAIBatchProviderError",
|
|
138
|
+
"OutputConfig",
|
|
139
|
+
"ParseSummary",
|
|
140
|
+
"ParsedRequestRecord",
|
|
141
|
+
"ProgrammaticBuilderBase",
|
|
142
|
+
"PromptAssetBundle",
|
|
143
|
+
"PromptAssetsConfig",
|
|
144
|
+
"ProviderCapabilities",
|
|
145
|
+
"ProviderConfig",
|
|
146
|
+
"ProviderKind",
|
|
147
|
+
"ProviderSelectionConfig",
|
|
148
|
+
"RawErrorRecord",
|
|
149
|
+
"RawOutputRecord",
|
|
150
|
+
"RawResultRecord",
|
|
151
|
+
"RequestRecord",
|
|
152
|
+
"RetryPolicyConfig",
|
|
153
|
+
"RunConfig",
|
|
154
|
+
"RunManifest",
|
|
155
|
+
"RunMetadataConfig",
|
|
156
|
+
"RunStatus",
|
|
157
|
+
"SingleTaskBase",
|
|
158
|
+
"SimpleTemplateBuilder",
|
|
159
|
+
"SourceFormat",
|
|
160
|
+
"SourceInputConfig",
|
|
161
|
+
"StructuredOutputParser",
|
|
162
|
+
"SUCCESSFUL_EXECUTION_STATUSES",
|
|
163
|
+
"TaskKind",
|
|
164
|
+
"TaskOrchestrator",
|
|
165
|
+
"TaskRunState",
|
|
166
|
+
"TERMINAL_EXECUTION_STATUSES",
|
|
167
|
+
"UnitRecord",
|
|
168
|
+
"ValidationSummary",
|
|
169
|
+
"CoverageValidationResult",
|
|
170
|
+
"artifact_path",
|
|
171
|
+
"artifact_ref",
|
|
172
|
+
"artifact_refs_for_run",
|
|
173
|
+
"artifact_relative_path",
|
|
174
|
+
"build_group_memberships",
|
|
175
|
+
"coverage_failures",
|
|
176
|
+
"default_run_id",
|
|
177
|
+
"derive_unit_id",
|
|
178
|
+
"get_artifact_definition",
|
|
179
|
+
"load_prompt_asset_text",
|
|
180
|
+
"load_prompt_assets",
|
|
181
|
+
"materialize_units",
|
|
182
|
+
"membership_map",
|
|
183
|
+
"normalize_execution_status",
|
|
184
|
+
"plan_fixed_size_groups",
|
|
185
|
+
"resolve_prompt_asset_path",
|
|
186
|
+
"run_directory",
|
|
187
|
+
"is_successful_execution_status",
|
|
188
|
+
"is_terminal_execution_status",
|
|
189
|
+
"validate_coverage",
|
|
190
|
+
"validate_group_coverage",
|
|
191
|
+
"validate_unique_unit_ids",
|
|
192
|
+
"__version__",
|
|
193
|
+
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .naming import (
|
|
2
|
+
ARTIFACT_REGISTRY,
|
|
3
|
+
ArtifactDefinition,
|
|
4
|
+
artifact_path,
|
|
5
|
+
artifact_ref,
|
|
6
|
+
artifact_refs_for_run,
|
|
7
|
+
artifact_relative_path,
|
|
8
|
+
get_artifact_definition,
|
|
9
|
+
run_directory,
|
|
10
|
+
)
|
|
11
|
+
from .local import LocalArtifactStore
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ARTIFACT_REGISTRY",
|
|
15
|
+
"ArtifactDefinition",
|
|
16
|
+
"LocalArtifactStore",
|
|
17
|
+
"artifact_path",
|
|
18
|
+
"artifact_ref",
|
|
19
|
+
"artifact_refs_for_run",
|
|
20
|
+
"artifact_relative_path",
|
|
21
|
+
"get_artifact_definition",
|
|
22
|
+
"run_directory",
|
|
23
|
+
]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Filesystem-backed artifact store implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ..configs.models import ArtifactStoreConfig
|
|
8
|
+
from ..contracts.base import ArtifactStore
|
|
9
|
+
from ..contracts.records import ArtifactRef
|
|
10
|
+
from ..enums import ArtifactFormat, ArtifactKind, ArtifactStoreKind
|
|
11
|
+
from ..manifests.models import RunManifest
|
|
12
|
+
from .naming import ARTIFACT_REGISTRY, artifact_path, artifact_ref, run_directory
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LocalArtifactStore(ArtifactStore):
    """Artifact store that keeps run artifacts in a local directory tree.

    Every method receives the ``ArtifactStoreConfig`` explicitly; locations are
    derived from ``config.root_dir`` plus the layout in ``ARTIFACT_REGISTRY``.
    """

    def validate_config(self, config: ArtifactStoreConfig) -> None:
        """Raise ``ValueError`` unless ``config.kind`` is ``ArtifactStoreKind.LOCAL``."""
        if config.kind is ArtifactStoreKind.LOCAL:
            return
        raise ValueError("LocalArtifactStore requires ArtifactStoreConfig.kind='local'")

    def run_path(self, run_id: str, config: ArtifactStoreConfig) -> Path:
        """Return the directory for ``run_id`` under ``config.root_dir``.

        The config is validated first; nothing is created on disk.
        """
        self.validate_config(config)
        root = config.root_dir
        return run_directory(run_id=run_id, runs_root=root)

    def artifact_path(self, run_id: str, artifact_kind: ArtifactKind, config: ArtifactStoreConfig) -> Path:
        """Return the file path of ``artifact_kind`` inside the run directory.

        The config is validated first; nothing is created on disk.
        """
        self.validate_config(config)
        root = config.root_dir
        # Resolves to the module-level ``artifact_path`` helper, not this method.
        return artifact_path(run_id=run_id, artifact_kind=artifact_kind, runs_root=root)

    def initialize_run(self, run_id: str, config: ArtifactStoreConfig) -> Path:
        """Create the run directory and every registered artifact sub-directory.

        Existing directories are left untouched. Returns the run directory.
        """
        run_root = self.run_path(run_id, config)
        run_root.mkdir(parents=True, exist_ok=True)

        for entry in ARTIFACT_REGISTRY.values():
            subdirectory = run_root / entry.relative_path.parent
            subdirectory.mkdir(parents=True, exist_ok=True)

        return run_root

    def write_artifact(
        self,
        run_id: str,
        artifact_kind: ArtifactKind,
        content: str | bytes,
        config: ArtifactStoreConfig,
    ) -> ArtifactRef:
        """Write ``content`` to the canonical file for ``artifact_kind``.

        ``bytes`` are written verbatim; anything else is written as UTF-8
        text. The parent directory is created when missing. Returns the
        ``ArtifactRef`` describing the artifact kind.
        """
        target = self.artifact_path(run_id, artifact_kind, config)
        target.parent.mkdir(parents=True, exist_ok=True)

        if isinstance(content, bytes):
            target.write_bytes(content)
            return artifact_ref(artifact_kind)

        target.write_text(content, encoding="utf-8")
        return artifact_ref(artifact_kind)

    def read_artifact(
        self,
        run_id: str,
        artifact_kind: ArtifactKind,
        config: ArtifactStoreConfig,
    ) -> str | bytes:
        """Read the stored artifact back from disk.

        JSON and JSONL artifacts are returned as UTF-8 decoded text; any other
        format is returned as raw bytes.
        """
        source = self.artifact_path(run_id, artifact_kind, config)
        ref = artifact_ref(artifact_kind)

        is_text = ref.format in {ArtifactFormat.JSON, ArtifactFormat.JSONL}
        return source.read_text(encoding="utf-8") if is_text else source.read_bytes()

    def resolve_artifact(
        self,
        run_id: str,
        artifact_kind: ArtifactKind,
        config: ArtifactStoreConfig,
    ) -> ArtifactRef:
        """Return the ``ArtifactRef`` for ``artifact_kind``.

        The path is computed only for its checks (config kind and registry
        lookup); the resulting path itself is discarded.
        """
        self.artifact_path(run_id, artifact_kind, config)
        return artifact_ref(artifact_kind)

    def write_manifest(self, manifest: RunManifest, config: ArtifactStoreConfig) -> ArtifactRef:
        """Serialize ``manifest`` as indented JSON and store it as the MANIFEST artifact.

        The run directory layout is initialized before writing.
        """
        run_id = manifest.run_id
        self.initialize_run(run_id, config)
        serialized = manifest.model_dump_json(indent=2)
        return self.write_artifact(run_id, ArtifactKind.MANIFEST, serialized, config)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path, PurePosixPath
|
|
5
|
+
|
|
6
|
+
from ..contracts.records import ArtifactRef
|
|
7
|
+
from ..enums import ArtifactFormat, ArtifactKind
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class ArtifactDefinition:
    """Immutable description of one artifact: its kind, format and location."""

    # Which artifact this entry describes.
    artifact_kind: ArtifactKind
    # Serialization format of the stored file.
    format: ArtifactFormat
    # Location of the file, relative to the run directory (POSIX-style).
    relative_path: PurePosixPath
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Canonical layout of a run directory: one entry per artifact kind, giving the
# serialization format and the file location relative to the run directory.
ARTIFACT_REGISTRY: dict[ArtifactKind, ArtifactDefinition] = {
    kind: ArtifactDefinition(
        artifact_kind=kind,
        format=file_format,
        relative_path=PurePosixPath(location),
    )
    for kind, file_format, location in (
        (ArtifactKind.RUN_CONFIG, ArtifactFormat.JSON, "config/run_config.json"),
        (ArtifactKind.MANIFEST, ArtifactFormat.JSON, "metadata/manifest.json"),
        (ArtifactKind.SUMMARY, ArtifactFormat.JSON, "metadata/summary.json"),
        (ArtifactKind.UNITS, ArtifactFormat.JSONL, "tables/units.jsonl"),
        (ArtifactKind.GROUPS, ArtifactFormat.JSONL, "tables/groups.jsonl"),
        (ArtifactKind.REQUESTS, ArtifactFormat.JSONL, "tables/requests.jsonl"),
        (ArtifactKind.RAW_OUTPUTS, ArtifactFormat.JSONL, "raw/raw_outputs.jsonl"),
        (ArtifactKind.RAW_ERRORS, ArtifactFormat.JSONL, "raw/raw_errors.jsonl"),
        (ArtifactKind.PARSED_REQUESTS, ArtifactFormat.JSONL, "parsed/parsed_requests.jsonl"),
        (ArtifactKind.FLATTENED_ANNOTATIONS, ArtifactFormat.JSONL, "parsed/flattened_annotations.jsonl"),
        (ArtifactKind.FAILURES, ArtifactFormat.JSONL, "parsed/failures.jsonl"),
    )
}
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_artifact_definition(artifact_kind: ArtifactKind) -> ArtifactDefinition:
    """Look up the registered definition for ``artifact_kind``.

    Raises ``KeyError`` when the kind has no entry in ``ARTIFACT_REGISTRY``.
    """
    definition = ARTIFACT_REGISTRY[artifact_kind]
    return definition
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def run_directory(run_id: str, runs_root: str | Path = "runs") -> Path:
|
|
81
|
+
return Path(runs_root) / run_id
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def artifact_relative_path(artifact_kind: ArtifactKind) -> PurePosixPath:
    """Return the location of ``artifact_kind`` relative to a run directory."""
    definition = get_artifact_definition(artifact_kind)
    return definition.relative_path
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def artifact_path(run_id: str, artifact_kind: ArtifactKind, runs_root: str | Path = "runs") -> Path:
    """Return the full path of ``artifact_kind`` for run ``run_id``.

    Joins the run directory under ``runs_root`` with the artifact's registered
    relative path; nothing is created on disk.
    """
    base = run_directory(run_id=run_id, runs_root=runs_root)
    relative = artifact_relative_path(artifact_kind)
    return base / relative
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def artifact_ref(artifact_kind: ArtifactKind) -> ArtifactRef:
    """Build the ``ArtifactRef`` record for ``artifact_kind``.

    The kind and format are copied from the registered definition, and the
    relative path is stored as a string.
    """
    entry = get_artifact_definition(artifact_kind)
    location = str(entry.relative_path)
    return ArtifactRef(artifact_kind=entry.artifact_kind, format=entry.format, relative_path=location)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def artifact_refs_for_run(run_id: str, runs_root: str | Path = "runs") -> dict[ArtifactKind, ArtifactRef]:
    """Return an ``ArtifactRef`` for every kind in ``ARTIFACT_REGISTRY``.

    The refs hold only relative paths, so they do not vary with ``run_id`` or
    ``runs_root``; those arguments are just passed through ``run_directory``.
    """
    # Result intentionally discarded; the call is kept so the arguments are
    # still run through the same path construction as before.
    run_directory(run_id=run_id, runs_root=runs_root)

    refs: dict[ArtifactKind, ArtifactRef] = {}
    for kind in ARTIFACT_REGISTRY:
        refs[kind] = artifact_ref(kind)
    return refs
|