carbonation 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. carbonation-0.0.1/.github/workflows/release.yml +26 -0
  2. carbonation-0.0.1/.github/workflows/test.yml +26 -0
  3. carbonation-0.0.1/.gitignore +139 -0
  4. carbonation-0.0.1/CLAUDE.md +110 -0
  5. carbonation-0.0.1/LICENSE +21 -0
  6. carbonation-0.0.1/PKG-INFO +354 -0
  7. carbonation-0.0.1/README.md +333 -0
  8. carbonation-0.0.1/examples/carbonation.service +33 -0
  9. carbonation-0.0.1/examples/config.toml +39 -0
  10. carbonation-0.0.1/examples/plugins/test_file_model.py +48 -0
  11. carbonation-0.0.1/examples/rules.toml +22 -0
  12. carbonation-0.0.1/pyproject.toml +50 -0
  13. carbonation-0.0.1/scripts/generate_fake_data.py +79 -0
  14. carbonation-0.0.1/src/carbonation/__init__.py +44 -0
  15. carbonation-0.0.1/src/carbonation/__main__.py +5 -0
  16. carbonation-0.0.1/src/carbonation/api.py +416 -0
  17. carbonation-0.0.1/src/carbonation/archive.py +140 -0
  18. carbonation-0.0.1/src/carbonation/cli.py +1231 -0
  19. carbonation-0.0.1/src/carbonation/config.py +395 -0
  20. carbonation-0.0.1/src/carbonation/db/__init__.py +0 -0
  21. carbonation-0.0.1/src/carbonation/db/engine.py +150 -0
  22. carbonation-0.0.1/src/carbonation/db/migrations/__init__.py +0 -0
  23. carbonation-0.0.1/src/carbonation/db/migrations/env.py +29 -0
  24. carbonation-0.0.1/src/carbonation/db/migrations/script.py.mako +26 -0
  25. carbonation-0.0.1/src/carbonation/db/migrations/versions/001_initial_schema.py +98 -0
  26. carbonation-0.0.1/src/carbonation/db/migrations/versions/__init__.py +0 -0
  27. carbonation-0.0.1/src/carbonation/db/models.py +223 -0
  28. carbonation-0.0.1/src/carbonation/db/queries.py +681 -0
  29. carbonation-0.0.1/src/carbonation/exceptions.py +17 -0
  30. carbonation-0.0.1/src/carbonation/log.py +39 -0
  31. carbonation-0.0.1/src/carbonation/plugins.py +128 -0
  32. carbonation-0.0.1/src/carbonation/reconcile.py +247 -0
  33. carbonation-0.0.1/src/carbonation/rules.py +199 -0
  34. carbonation-0.0.1/src/carbonation/service.py +848 -0
  35. carbonation-0.0.1/src/carbonation/watcher.py +131 -0
  36. carbonation-0.0.1/tests/__init__.py +0 -0
  37. carbonation-0.0.1/tests/conftest.py +126 -0
  38. carbonation-0.0.1/tests/test_api.py +278 -0
  39. carbonation-0.0.1/tests/test_archive.py +208 -0
  40. carbonation-0.0.1/tests/test_cli.py +445 -0
  41. carbonation-0.0.1/tests/test_config.py +302 -0
  42. carbonation-0.0.1/tests/test_db.py +437 -0
  43. carbonation-0.0.1/tests/test_integration.py +822 -0
  44. carbonation-0.0.1/tests/test_plugins.py +79 -0
  45. carbonation-0.0.1/tests/test_reconcile.py +186 -0
  46. carbonation-0.0.1/tests/test_rules.py +203 -0
  47. carbonation-0.0.1/tests/test_service.py +551 -0
  48. carbonation-0.0.1/tests/test_watcher.py +121 -0
  49. carbonation-0.0.1/uv.lock +804 -0
@@ -0,0 +1,26 @@
1
+ name: "Publish"
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment:
11
+ name: pypi
12
+ permissions:
13
+ id-token: write
14
+ contents: read
15
+ steps:
16
+ - name: Checkout
17
+ uses: actions/checkout@v6
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v7
21
+
22
+ - name: Build
23
+ run: uv build
24
+
25
+ - name: Publish
26
+ run: uv publish
@@ -0,0 +1,26 @@
1
+ name: Test
2
+
3
+ on: [push, pull_request]
4
+
5
+ jobs:
6
+ build:
7
+ runs-on: ubuntu-latest
8
+ strategy:
9
+ fail-fast: false
10
+ matrix:
11
+ python-version: ["3.11", "3.12", "3.13"]
12
+
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+
16
+ - name: Install uv and set the Python version
17
+ uses: astral-sh/setup-uv@v7
18
+ with:
19
+ enable-cache: true
20
+ python-version: ${{ matrix.python-version }}
21
+
22
+ - name: Install the project
23
+ run: uv sync --locked --all-extras --dev
24
+
25
+ - name: Run pytest
26
+ run: uv run pytest tests/
@@ -0,0 +1,139 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+ __about__.py
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+
88
+ # pipenv
89
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
91
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
92
+ # install all needed dependencies.
93
+ #Pipfile.lock
94
+
95
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96
+ __pypackages__/
97
+
98
+ # Celery stuff
99
+ celerybeat-schedule
100
+ celerybeat.pid
101
+
102
+ # SageMath parsed files
103
+ *.sage.py
104
+
105
+ # Environments
106
+ .env
107
+ .venv
108
+ env/
109
+ venv/
110
+ ENV/
111
+ env.bak/
112
+ venv.bak/
113
+
114
+ # Spyder project settings
115
+ .spyderproject
116
+ .spyproject
117
+
118
+ # Rope project settings
119
+ .ropeproject
120
+
121
+ # mkdocs documentation
122
+ /site
123
+
124
+ # mypy
125
+ .mypy_cache/
126
+ .dmypy.json
127
+ dmypy.json
128
+
129
+ # Pyre type checker
130
+ .pyre/
131
+
132
+ # hatch-vcs
133
+ _version.py
134
+
135
+ # ruff
136
+ .ruff_cache
137
+
138
+ # testing
139
+ test_data
@@ -0,0 +1,110 @@
1
+ # Carbonation
2
+
3
+ File delivery monitoring and archival service. Watches directories for incoming files, groups them by stem, validates integrity, extracts metadata via plugins, applies selection/routing rules, and archives to configurable destinations.
4
+
5
+ ## Quick Reference
6
+
7
+ ```bash
8
+ uv run pytest tests/ # all tests (~24s)
9
+ uv run pytest tests/ -m "not integration" # unit tests only (~2s)
10
+ uv run pytest tests/test_integration.py # integration tests only (~22s)
11
+ uv run ruff check src/ tests/ # lint
12
+ uv run ruff check --fix src/ tests/ # lint + autofix
13
+ ```
14
+
15
+ ## Project Layout
16
+
17
+ ```
18
+ src/carbonation/ # main package (src layout, built with hatchling)
19
+ cli.py # Click CLI: check-config, db init/status, run, reconcile
20
+ config.py # Pydantic models + TOML loading for config.toml and rules.toml
21
+ service.py # Main orchestrator: watcher -> rules -> archive pipeline
22
+ watcher.py # Watchdog integration with settle-timer debouncing
23
+ rules.py # Integrity, completeness, selection, routing evaluation
24
+ archive.py # File copy/move/hardlink/symlink with permissions
25
+ reconcile.py # Sync delivery/archive directories with DB
26
+ plugins.py # Plugin registry: loads .py files, indexes callables
27
+ log.py # Loguru configuration
28
+ exceptions.py # CarbonationError hierarchy
29
+ db/
30
+ models.py # SQLAlchemy ORM: FileComponent, FileBase (abstract)
31
+ queries.py # All DB operations (no auto-commit; callers commit)
32
+ engine.py # Engine/session factory, schema init/verify
33
+ examples/ # Example config.toml, rules.toml, plugin
34
+ scripts/ # generate_fake_data.py for test data
35
+ tests/ # pytest suite
36
+ ```
37
+
38
+ ## Architecture
39
+
40
+ **Processing pipeline** (in `service.py:_process_event_inner`):
41
+ 1. Record component (insert-first, handle IntegrityError for races)
42
+ 2. Integrity checks (min_size, readability)
43
+ 3. Completeness grouping (stem-based or plugin-based, with group locking)
44
+ 4. Metadata extraction (plugin `extract()` function)
45
+ 5. Selection rules (OR logic, has_keys / field matches / plugin)
46
+ 6. Routing (glob or plugin, first-match-wins)
47
+ 7. Archive (copy/move/hardlink/symlink with configurable permissions)
48
+
49
+ **Threading model**: Main thread runs event loop, ThreadPoolExecutor processes events, per-group locks (WeakValueDictionary) protect completeness checks, periodic Timer for reconciliation + completeness timeouts.
50
+
51
+ **Database**: Commits happen at caller boundaries, not in individual query functions. `_process_event` commits once after the full pipeline. `reconcile_delivery`/`reconcile_archive` commit at the end.
52
+
53
+ **Plugin system**: Plugins are `.py` files loaded from a configured directory. Must provide a `FileBase` subclass with `__tablename__ = "files"` for the File model, and an `extract(file_paths) -> dict` function for metadata. Domain columns on the File model must be nullable (populated after group creation).
54
+
55
+ ## Testing
56
+
57
+ - **Unit tests** (`test_*.py` except `test_integration.py`): Fast, in-memory SQLite, no filesystem watchers
58
+ - **Integration tests** (`test_integration.py`): Start real service in background thread, write files to disk, verify archive + DB state. Marked `@pytest.mark.integration`.
59
+ - **conftest.py** registers a `_TestFile(FileBase)` model at module level with both unit test fields (`label`) and integration test fields (`start`, `stop`, `category`, `source`). Integration test plugins must NOT define their own FileBase subclass — only provide `extract()`.
60
+ - Permission tests (`test_archive.py`) are skipped on Windows (`@pytest.mark.skipif(sys.platform == "win32")`)
61
+
62
+ ## Configuration
63
+
64
+ Two TOML files:
65
+ - **config.toml**: service settings, logging, database, plugins, watch directories, archive destinations
66
+ - **rules.toml**: integrity checks, completeness grouping, metadata module, selection rules, routing rules
67
+
68
+ Paths in config.toml are resolved relative to the config file's directory. `archive.destination` is resolved relative to `archive.base_path` when set.
69
+
70
+ **Hot-reload**: The service watches `rules.toml` for changes (debounced 0.5s). On modification it validates the new file, checks that all watch configs still reference valid rule sets, and atomically swaps `self.rules_config`. Invalid files or missing rule sets are rejected with a log warning — the service keeps running with the previous rules.
71
+
72
+ **Database credentials**: `DatabaseConfig` uses structured fields (`drivername`, `host`, `port`, `username`, `password`, `name`) instead of a raw URL string. The `url` property builds the SQLAlchemy URL via `URL.create()`. Credentials are layered: config.toml values take priority over `~/.config/carbonation/secrets.toml`. Secrets file uses the same `[database]` section format. SQLite `name` paths are resolved relative to the config file.
73
+
74
+ ## Reliability
75
+
76
+ - **Retry logic**: Archive failures mark components `RETRY_PENDING` instead of `FAILED`. A periodic checker re-enqueues them after `retry_delay`. After `max_retries` attempts, a component is permanently marked `FAILED`. Failed components can be retried manually via `carbonation retry`.
77
+ - **Post-archive verification**: File size is compared after copy/move. A mismatch deletes the corrupt copy and raises an error (which triggers a retry).
78
+ - **Watch liveness**: Each reconciliation cycle verifies watch directories are present and readable.
79
+
80
+ ## Observability
81
+
82
+ - **Heartbeat**: `service_state` table updated on its own timer (`heartbeat_interval`, default `60s`) with `last_heartbeat`, `queue_depth`. Also pings the systemd watchdog (`WATCHDOG=1`) each cycle.
83
+ - **`carbonation status`**: Shows heartbeat age, component counts, watch directory health.
84
+ - **Stats logging**: Periodic summary logged each heartbeat: `Stats: archived=42, failed=2, queue_depth=0`.
85
+ - **Structured logging**: Set `format = "json"` in `[logging]` config for JSON lines output (loguru `serialize=True`).
86
+ - **`carbonation check-rules`**: Dry-run a candidate rules file against existing DB groups to preview selection/routing changes.
87
+
88
+ ## CLI Commands
89
+
90
+ ```
91
+ carbonation check-config # validate config + rules
92
+ carbonation db init # create database schema
93
+ carbonation db status # component counts, metadata check
94
+ carbonation run [--dry-run] [--once] # start the service
95
+ carbonation status # heartbeat, counts, watch health
96
+ carbonation retry [--watch-name X] # re-enqueue failed components
97
+ carbonation check-rules FILE # dry-run rules changes
98
+ carbonation reconcile delivery # sync delivery dirs with DB
99
+ carbonation reconcile archive # sync archive dirs with DB
100
+ carbonation query files [filters] # dynamic query with plugin columns
101
+ ```
102
+
103
+ ## Common Patterns
104
+
105
+ - All config models are Pydantic with field validators
106
+ - Duration strings: `"5s"`, `"1m"`, `"24h"`, `"7d"` (parsed by `config.parse_duration`)
107
+ - File permissions: `file_mode`/`dir_mode` as octal ints (default `0o440`/`0o550`), only applied on copy/move (not hardlink/symlink)
108
+ - `signal.signal()` is guarded for non-main-thread usage (needed for test harness)
109
+ - DB commits happen at caller boundaries, not in query functions
110
+ - Retry count tracked per-component; periodic checker increments on re-enqueue
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jonathan Olsten
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,354 @@
1
+ Metadata-Version: 2.4
2
+ Name: carbonation
3
+ Version: 0.0.1
4
+ Summary: File delivery monitoring and archival service
5
+ Author-email: Jonathan Olsten <jonathan.olsten@gmail.com>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: alembic<2,>=1.13
9
+ Requires-Dist: click>=8.1
10
+ Requires-Dist: loguru>=0.7.3
11
+ Requires-Dist: pydantic<3,>=2.10
12
+ Requires-Dist: rich>=14.3.3
13
+ Requires-Dist: sdnotify<1,>=0.3
14
+ Requires-Dist: sqlalchemy<3,>=2.0.48
15
+ Requires-Dist: watchdog>=6.0.0
16
+ Provides-Extra: mariadb
17
+ Requires-Dist: mysqlclient<3,>=2.2; extra == 'mariadb'
18
+ Provides-Extra: postgresql
19
+ Requires-Dist: psycopg2<3,>=2.9; extra == 'postgresql'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # Carbonation
23
+
24
+ File delivery monitoring and archival service. Watches directories for incoming files, groups them by stem, validates integrity, extracts metadata via plugins, applies selection and routing rules, and archives to configurable destinations.
25
+
26
+ ## Installation
27
+
28
+ Requires Python 3.11+.
29
+
30
+ ```bash
31
+ pip install carbonation
32
+ ```
33
+
34
+ For MariaDB or PostgreSQL support:
35
+
36
+ ```bash
37
+ pip install "carbonation[mariadb]" # mysqlclient
38
+ pip install "carbonation[postgresql]" # psycopg2
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```bash
44
+ # 1. Scaffold a new project
45
+ mkdir /opt/carbonation && cd /opt/carbonation
46
+ carbonation init
47
+
48
+ # 2. Edit the generated files
49
+ $EDITOR config.toml # set watch paths, archive destination, database
50
+ $EDITOR rules.toml # configure integrity, completeness, selection rules
51
+ $EDITOR plugins/file_model.py # define your File model columns and extract()
52
+
53
+ # 3. Validate, initialize, and run
54
+ carbonation check-config # validate configuration
55
+ carbonation db init # create database tables
56
+ carbonation run # start the service
57
+ ```
58
+
59
+ `carbonation init` generates a complete starter project:
60
+
61
+ ```
62
+ config.toml # service, logging, database, watch, and archive config
63
+ rules.toml # integrity, completeness, metadata, selection, routing
64
+ plugins/file_model.py # File model + extract() function template
65
+ ```
66
+
67
+ Edit `plugins/file_model.py` to define the domain-specific columns for your use case and the `extract()` function that reads metadata from your files. See [Plugin System](#plugin-system) for details.
68
+
69
+ ## CLI Reference
70
+
71
+ ```
72
+ carbonation check-config Validate config.toml and rules.toml
73
+ carbonation db init Create schema (migrations + plugin columns)
74
+ carbonation db upgrade Apply pending schema migrations only
75
+ carbonation db status Component counts and metadata check
76
+ carbonation run [--dry-run] [--once] Start the service
77
+ carbonation status Heartbeat, status table, watch health
78
+ carbonation retry [--watch-name X] Re-enqueue failed components
79
+ carbonation check-rules FILE Dry-run rules changes against DB
80
+ carbonation reconcile delivery Sync delivery directories with DB
81
+ carbonation reconcile archive Sync archive directories with DB
82
+ carbonation query files [filters] Query with dynamic plugin filters
83
+ ```
84
+
85
+ ### Query Examples
86
+
87
+ ```bash
88
+ # Filter by plugin columns (dynamically generated from your model)
89
+ carbonation query files --category alpha --limit 50
90
+
91
+ # Date range overlap
92
+ carbonation query files --daterange 2026-01-01/2026-02-01
93
+
94
+ # Recent files by age
95
+ carbonation query files --age 7d
96
+
97
+ # Output formats
98
+ carbonation query files --category alpha --format json
99
+ carbonation query files --format csv > export.csv
100
+
101
+ # Count only
102
+ carbonation query files --category alpha --count
103
+
104
+ # Specific columns, sorted
105
+ carbonation query files --columns group_key,category,start --order-by -start
106
+ ```
107
+
108
+ ## Programmatic API
109
+
110
+ Query the database from Python scripts without going through the CLI:
111
+
112
+ ```python
113
+ from carbonation import connect
114
+
115
+ with connect("config.toml") as db:
116
+ # All files from a watch
117
+ rows = db.query_files({"watch_name": ["incoming"]}, limit=50)
118
+ for row in rows:
119
+ print(row["group_key"], row["created_at"])
120
+
121
+ # Count
122
+ n = db.query_files({"complete": True}, count_only=True)
123
+
124
+ # Date filtering + plugin columns
125
+ from datetime import datetime
126
+ rows = db.query_files({
127
+ "category": ["alpha"],
128
+ "created_at_after": datetime(2026, 1, 1),
129
+ }, order_by="-created_at", columns=["group_key", "category", "start"])
130
+ ```
131
+
132
+ For one-off scripts, use the convenience function that handles setup and teardown in one call:
133
+
134
+ ```python
135
+ from carbonation import query_files
136
+
137
+ rows = query_files("config.toml", {"category": ["alpha"]}, limit=50)
138
+ ```
139
+
140
+ ### Watermark-based polling
141
+
142
+ For external consumers (e.g. an orchestrator) that need to poll for newly completed files without gaps:
143
+
144
+ ```python
145
+ from carbonation import connect
146
+
147
+ cursor = None # persist this between invocations
148
+ with connect("config.toml") as db:
149
+ while True:
150
+ rows, cursor = db.get_new_files(cursor=cursor)
151
+ if not rows:
152
+ break
153
+ for row in rows:
154
+ print(row["group_key"], row["completed_at"])
155
+
156
+ # Check if the service is alive
157
+ status = db.get_service_status()
158
+ if status:
159
+ print(f"Last heartbeat: {status['last_heartbeat']}")
160
+ ```
161
+
162
+ ### ORM access
163
+
164
+ For direct ORM access to query, modify, and commit changes:
165
+
166
+ ```python
167
+ from carbonation import session
168
+ from carbonation.db.models import FileComponent, FileStatus
169
+
170
+ with session("config.toml") as s:
171
+ comps = s.query(FileComponent).filter_by(status=FileStatus.ARCHIVED).all()
172
+ for c in comps:
173
+ c.status = FileStatus.CLEARED
174
+ s.commit()
175
+ ```
176
+
177
+ ### Filter operators
178
+
179
+ | Suffix | Operator | Example |
180
+ |---|---|---|
181
+ | `_after` / `_before` | `>=` / `<=` (datetime) | `{"created_at_after": datetime(2026, 1, 1)}` |
182
+ | `_min` / `_max` | `>=` / `<=` (numeric) | `{"id_min": 100}` |
183
+ | `_daterange` | start/stop overlap | `{"_daterange": (start, end)}` |
184
+ | `_age` | recency | `{"_age": datetime(2026, 3, 20)}` |
185
+ | (none) | exact / IN / LIKE | `{"category": ["alpha", "beta"]}` |
186
+
187
+ ## Architecture
188
+
189
+ ### Processing Pipeline
190
+
191
+ ```mermaid
192
+ flowchart TD
193
+ A["File arrives in<br>watch directory"] --> B["Watchdog detects<br>creation / modification"]
194
+ B --> C["Settle timer<br>debounces"]
195
+ C --> D["Event enqueued"]
196
+ D --> E["Worker thread<br>picks up event"]
197
+
198
+ E --> S1["1 — Record component"]
199
+ S1 --> IC{{"2 — Integrity checks<br>(min_size, readability)"}}
200
+
201
+ IC -- fail --> IF["INTEGRITY_FAILED"]
202
+ IC -- pass --> S3{{"3 — Completeness<br>grouping"}}
203
+
204
+ S3 -- "incomplete<br>(waiting for extensions)" --> W["WAITING"]
205
+ S3 -- "timeout<br>(on_timeout=skip)" --> TO["TIMED_OUT"]
206
+ S3 -- "complete / standalone" --> S4["4 — Metadata extraction<br>(plugin extract)"]
207
+
208
+ S4 --> S5{{"5 — Selection rules"}}
209
+ S5 -- rejected --> NS["NOT_SELECTED"]
210
+ S5 -- accepted --> S6["6 — Routing<br>(choose archive)"]
211
+
212
+ S6 --> S7{{"7 — Archive<br>(copy / move / hardlink / symlink)"}}
213
+ S7 -- success --> AR["ARCHIVED"]
214
+ S7 -- "failure<br>(retries left)" --> RP["RETRY_PENDING"]
215
+ S7 -- "failure<br>(retries exhausted)" --> FA["FAILED"]
216
+
217
+ RP -. "retry_delay<br>elapsed" .-> E
218
+ IF -. "reconciliation<br>(re-assessed)" .-> E
219
+ NS -. "rules hot-reload<br>(re-evaluated)" .-> E
220
+
221
+ style IF fill:#d32f2f,color:#fff
222
+ style FA fill:#d32f2f,color:#fff
223
+ style TO fill:#757575,color:#fff
224
+ style NS fill:#757575,color:#fff
225
+ style W fill:#f9a825
226
+ style RP fill:#f57c00,color:#fff
227
+ style AR fill:#388e3c,color:#fff
228
+ ```
229
+
230
+ ### Reliability
231
+
232
+ - **Retry with backoff**: Archive failures are retried up to `max_retries` times, waiting `retry_delay` between attempts. A component is marked permanently failed only after all retries are exhausted.
233
+ - **Post-archive verification**: File size is verified after copy/move. On a mismatch, the corrupt copy is deleted and the archive operation is retried.
234
+ - **Reconciliation**: Periodic sync between the filesystem and the database catches missed files, clears removed files, and re-enqueues pending work.
235
+ - **Rules hot-reload**: The service watches `rules.toml` for changes and validates before applying. Invalid changes are rejected without disruption.
236
+
237
+ ### Plugin System
238
+
239
+ Plugins are `.py` files in the configured plugins directory. A plugin must provide:
240
+
241
+ - A `FileBase` subclass with `__tablename__ = "files"` defining domain-specific columns
242
+ - An `extract(file_paths: list[Path]) -> dict` function that returns metadata for the domain columns
243
+
244
+ Domain columns must be nullable since they are populated after the group row is created.
245
+
246
+ ### Configuration
247
+
248
+ - **config.toml**: Service settings, logging, database, plugins, watch directories, archive destinations
249
+ - **rules.toml**: Integrity checks, completeness grouping, metadata module, selection rules, routing rules
250
+
251
+ Paths are resolved relative to the config file's directory. `archive.destination` is resolved relative to `archive.base_path` when it is set.
252
+
253
+ Rules can be hot-reloaded by editing `rules.toml` while the service is running.
254
+
255
+ ## Deployment
256
+
257
+ ### Setting up a production install
258
+
259
+ Create a dedicated virtualenv so the service has an isolated, reproducible Python environment:
260
+
261
+ ```bash
262
+ # Create the project directory and venv
263
+ sudo mkdir -p /opt/carbonation
264
+ cd /opt/carbonation
265
+ python3 -m venv .venv
266
+
267
+ # Install carbonation into the venv
268
+ .venv/bin/pip install carbonation
269
+ # Add database drivers if needed:
270
+ # .venv/bin/pip install "carbonation[mariadb]"
271
+ # .venv/bin/pip install "carbonation[postgresql]"
272
+
273
+ # Scaffold config, rules, and plugin template
274
+ .venv/bin/carbonation init
275
+
276
+ # Edit to match your environment
277
+ $EDITOR config.toml rules.toml plugins/file_model.py
278
+
279
+ # Validate and initialize
280
+ .venv/bin/carbonation check-config
281
+ .venv/bin/carbonation db init
282
+ ```
283
+
284
+ ### systemd
285
+
286
+ Generate a systemd unit file during scaffolding:
287
+
288
+ ```bash
289
+ .venv/bin/carbonation init --systemd --user carbonation
290
+ ```
291
+
292
+ Then install and enable it:
293
+
294
+ ```bash
295
+ sudo cp carbonation.service /etc/systemd/system/
296
+ sudo systemctl daemon-reload
297
+ sudo systemctl enable --now carbonation
298
+ sudo systemctl status carbonation
299
+ ```
300
+
301
+ The generated unit file uses `Type=notify` with `WatchdogSec=300` — systemd waits for carbonation to signal readiness before marking it as started, and automatically restarts it if the watchdog heartbeat stops (e.g. hung process). Edit `ReadWritePaths=` in the unit file to include your delivery and archive directories.
302
+
303
+ `WatchdogSec` should be at least 2x your `heartbeat_interval` in `config.toml` (default: `60s`).
304
+
305
+ ### Cron
306
+
307
+ For environments where a persistent service is not needed, run carbonation as a cron job:
308
+
309
+ ```bash
310
+ */5 * * * * /opt/carbonation/.venv/bin/carbonation -c /opt/carbonation/config.toml run --once
311
+ ```
312
+
313
+ `--once` reconciles delivery directories, processes all pending events, and exits. It can also supplement a running service as defense-in-depth.
314
+
315
+ ### Upgrading
316
+
317
+ ```bash
318
+ cd /opt/carbonation
319
+ .venv/bin/pip install --upgrade carbonation
320
+ .venv/bin/carbonation -c config.toml db upgrade # apply schema migrations
321
+ .venv/bin/carbonation -c config.toml db init # sync plugin columns
322
+ sudo systemctl restart carbonation
323
+ ```
324
+
325
+ `db upgrade` applies pending Alembic migrations (carbonation's base tables). `db init` then adds any new plugin-defined columns. Both are idempotent.
326
+
327
+ ### Monitoring
328
+
329
+ ```bash
330
+ .venv/bin/carbonation -c /opt/carbonation/config.toml status
331
+ ```
332
+
333
+ The `status` command shows:
334
+ - Service heartbeat age (warns if stale)
335
+ - Component status breakdown by time window (1h/4h/1d/7d/30d)
336
+ - Delivery and archive totals
337
+ - Retry queue depth
338
+ - Oldest incomplete group
339
+ - Watch directory health
340
+
341
+ For log aggregation, set `format = "json"` in the logging config to produce structured JSON lines.
342
+
343
+ ## Development
344
+
345
+ ```bash
346
+ uv sync # install dependencies
347
+ uv run pytest tests/ # all tests (~50s)
348
+ uv run pytest tests/ -m "not integration" # unit tests only (~2s)
349
+ uv run ruff check src/ tests/ # lint
350
+ ```
351
+
352
+ ## License
353
+
354
+ MIT