docctl 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docctl-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Georges Alkhouri
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,13 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ graft src
5
+ prune .github
6
+ prune benchmarks
7
+ prune build
8
+ prune dist
9
+ prune docs
10
+ prune scripts
11
+ prune tests
12
+ global-exclude __pycache__
13
+ global-exclude *.py[cod]
docctl-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: docctl
3
+ Version: 0.1.0
4
+ Summary: CLI-first local document retrieval tool
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/GeorgesAlkhouri/docctl
7
+ Project-URL: Repository, https://github.com/GeorgesAlkhouri/docctl
8
+ Project-URL: Issues, https://github.com/GeorgesAlkhouri/docctl/issues
9
+ Project-URL: Changelog, https://github.com/GeorgesAlkhouri/docctl/blob/main/CHANGELOG.md
10
+ Keywords: cli,documents,local-first,retrieval,search
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Text Processing
18
+ Classifier: Topic :: Utilities
19
+ Requires-Python: <3.14,>=3.12
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: chromadb>=0.5.23
23
+ Requires-Dist: llama-index-core>=0.11.23
24
+ Requires-Dist: python-docx
25
+ Requires-Dist: pdfplumber
26
+ Requires-Dist: pypdf
27
+ Requires-Dist: reportlab
28
+ Requires-Dist: sentence-transformers>=3.3.1
29
+ Requires-Dist: typer>=0.12.5
30
+ Dynamic: license-file
31
+
32
+ <p align="center">
33
+ <picture>
34
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_dark.png" />
35
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_light.png" />
36
+ <img alt="docctl logo" src="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_light.png" width="560" />
37
+ </picture>
38
+ </p>
39
+
40
+ <p align="center">
41
+ Local-first CLI for agent and human document retrieval with provenance-grounded answers,
42
+ a local vector store, and predictable machine-readable output.
43
+ </p>
44
+
45
+ <p align="center">
46
+ <a href="https://github.com/GeorgesAlkhouri/docctl/actions/workflows/ci.yml">
47
+ <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/GeorgesAlkhouri/docctl/ci.yml?branch=main&style=for-the-badge&label=ci&logo=githubactions&logoColor=white" />
48
+ </a>
49
+ <a href="https://github.com/GeorgesAlkhouri/docctl/actions/workflows/security-trivy.yml">
50
+ <img alt="Trivy" src="https://img.shields.io/github/actions/workflow/status/GeorgesAlkhouri/docctl/security-trivy.yml?branch=main&style=for-the-badge&label=trivy&logo=githubactions&logoColor=white" />
51
+ </a>
52
+ <a href="https://sonarcloud.io/summary/new_code?id=GeorgesAlkhouri_docctl">
53
+ <img alt="Quality Gate" src="https://img.shields.io/sonar/quality_gate/GeorgesAlkhouri_docctl?server=https%3A%2F%2Fsonarcloud.io&style=for-the-badge&label=quality%20gate&logo=sonar&logoColor=white" />
54
+ </a>
55
+ <a href="https://codecov.io/gh/GeorgesAlkhouri/docctl">
56
+ <img alt="Codecov" src="https://img.shields.io/codecov/c/github/GeorgesAlkhouri/docctl?style=for-the-badge&logo=codecov&logoColor=white&label=codecov" />
57
+ </a>
58
+ </p>
59
+
60
+ <p align="center">
61
+ <a href="https://www.python.org/downloads/">
62
+ <img alt="Python 3.12 | 3.13" src="https://img.shields.io/badge/python-3.12%20%7C%203.13-3776AB?style=for-the-badge&logo=python&logoColor=white" />
63
+ </a>
64
+ <a href="https://github.com/GeorgesAlkhouri/docctl">
65
+ <img alt="Local-first" src="https://img.shields.io/badge/local--first-no%20cloud%20required-2EA44F?style=for-the-badge&logo=homeassistant&logoColor=white" />
66
+ </a>
67
+ <a href="https://docs.trychroma.com/">
68
+ <img alt="Chroma" src="https://img.shields.io/badge/chroma-vector%20store-FF6F00?style=for-the-badge&logo=sqlite&logoColor=white" />
69
+ </a>
70
+ </p>
71
+
72
+ ## Why docctl
73
+ - Optimized for agentic retrieval loops with fast multi-step questions and answers.
74
+ - Runs locally with a persistent Chroma-backed index.
75
+ - Ingests `.pdf`, `.docx`, `.txt`, and `.md` with provenance metadata (`doc_id`, `source`, `title`).
76
+ - Uses sentence-aware chunking for better retrieval quality.
77
+ - Supports deterministic `--json` output for automation and agents.
78
+ - Exposes stable CLI workflows for ingest, search, diagnostics, and inventory.
79
+
80
+ ## Agent Integration
81
+ Use [SKILL.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/SKILL.md) when you want an agent to drive `docctl` end-to-end.
82
+ The skill uses `session` for fast iterative retrieval.
83
+
84
+ ## Quickstart
85
+ Requirements:
86
+ - Python 3.12 or 3.13
87
+ - `uv`
88
+
89
+ ```bash
90
+ # 1) Install dependencies
91
+ uv sync --frozen --dev
92
+
93
+ # 2) Verify CLI
94
+ uv run docctl --help
95
+
96
+ # 3) Ingest supported files
97
+ uv run docctl ingest ./docs --recursive --approve-write --allow-model-download
98
+
99
+ # 4) Search indexed content
100
+ uv run docctl search "security gateway diagnostics" --top-k 5 --allow-model-download
101
+
102
+ # 5) Show one chunk by id (replace with an id from search output)
103
+ uv run docctl show <chunk_id_from_search> --allow-model-download
104
+ ```
105
+
106
+ After the first public release, install from PyPI with:
107
+
108
+ ```bash
109
+ pip install docctl
110
+ ```
111
+
112
+ ## Command Overview
113
+ | Command | Purpose |
114
+ |---|---|
115
+ | `docctl ingest <path>` | Ingest one supported file or a directory of supported files (mutates local index state). |
116
+ | `docctl search <query>` | Search indexed content with optional metadata filters. |
117
+ | `docctl show <chunk_id>` | Show one indexed chunk by exact id. |
118
+ | `docctl stats` | Show index statistics. |
119
+ | `docctl catalog` | Show index summary and per-document inventory. |
120
+ | `docctl doctor` | Run local diagnostics for index and embedding setup. |
121
+ | `docctl session` | Run a read-only NDJSON request session on stdin/stdout. |
122
+
123
+ ## JSON and Session Mode
124
+ Use `--json` for deterministic machine-readable output:
125
+
126
+ ```bash
127
+ uv run docctl --json search "security gateway diagnostics" --top-k 5 --allow-model-download
128
+ ```
129
+
130
+ Use `session` for NDJSON request/response flows. For agents, this is the preferred fast path whenever one workflow needs two or more read operations:
131
+
132
+ ```bash
133
+ cat <<'EOF' | uv run docctl session --allow-model-download
134
+ {"id":"q1","op":"search","query":"security gateway diagnostics","top_k":5}
135
+ {"id":"q2","op":"catalog"}
136
+ EOF
137
+ ```
138
+
139
+ ## Configuration
140
+ Global options:
141
+ - `--index-path` (default: `.docctl`)
142
+ - `--collection` (default: `default`)
143
+ - `--json` (deterministic JSON payloads on stdout)
144
+ - `--verbose` (extra diagnostics)
145
+
146
+ Model downloads are explicit:
147
+ - Use `--allow-model-download` when embedding artifacts are not already available.
148
+
149
+ Mutation boundaries:
150
+ - `ingest` is mutating.
151
+ - `search`, `show`, `stats`, `catalog`, `doctor`, and `session` are read-only.
152
+
153
+ ## Development
154
+ Run core quality checks:
155
+
156
+ ```bash
157
+ make lint
158
+ make format-check
159
+ make typecheck
160
+ make security-lint
161
+ make import-lint
162
+ make test
163
+ make test-cov
164
+ make check-markdown-links
165
+ ```
166
+
167
+ Apply formatting fixes:
168
+
169
+ ```bash
170
+ make format
171
+ ```
172
+
173
+ Build release artifacts locally:
174
+
175
+ ```bash
176
+ make build-dist
177
+ make check-dist
178
+ make release-dry-run
179
+ ```
180
+
181
+ ## Documentation Map
182
+ - [ARCHITECTURE.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/ARCHITECTURE.md)
183
+ - [docs/design-docs/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/design-docs/index.md)
184
+ - [docs/product-specs/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/product-specs/index.md)
185
+ - [docs/references/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/references/index.md)
186
+ - [SECURITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/SECURITY.md) (canonical vulnerability disclosure policy)
187
+ - [docs/RELIABILITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/RELIABILITY.md)
188
+ - [docs/SECURITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/SECURITY.md) (internal implementation security guardrails)
189
+ - [docs/PLANS.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/PLANS.md)
190
+
191
+ ## Contributing
192
+ For implementation and validation workflow, start with:
193
+ 1. [AGENTS.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/AGENTS.md)
194
+ 2. [ARCHITECTURE.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/ARCHITECTURE.md)
195
+ 3. The indexed docs under `docs/` listed above.
docctl-0.1.0/README.md ADDED
@@ -0,0 +1,164 @@
1
+ <p align="center">
2
+ <picture>
3
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_dark.png" />
4
+ <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_light.png" />
5
+ <img alt="docctl logo" src="https://raw.githubusercontent.com/GeorgesAlkhouri/docctl/main/docs/assets/docctl_logo_light.png" width="560" />
6
+ </picture>
7
+ </p>
8
+
9
+ <p align="center">
10
+ Local-first CLI for agent and human document retrieval with provenance-grounded answers,
11
+ a local vector store, and predictable machine-readable output.
12
+ </p>
13
+
14
+ <p align="center">
15
+ <a href="https://github.com/GeorgesAlkhouri/docctl/actions/workflows/ci.yml">
16
+ <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/GeorgesAlkhouri/docctl/ci.yml?branch=main&style=for-the-badge&label=ci&logo=githubactions&logoColor=white" />
17
+ </a>
18
+ <a href="https://github.com/GeorgesAlkhouri/docctl/actions/workflows/security-trivy.yml">
19
+ <img alt="Trivy" src="https://img.shields.io/github/actions/workflow/status/GeorgesAlkhouri/docctl/security-trivy.yml?branch=main&style=for-the-badge&label=trivy&logo=githubactions&logoColor=white" />
20
+ </a>
21
+ <a href="https://sonarcloud.io/summary/new_code?id=GeorgesAlkhouri_docctl">
22
+ <img alt="Quality Gate" src="https://img.shields.io/sonar/quality_gate/GeorgesAlkhouri_docctl?server=https%3A%2F%2Fsonarcloud.io&style=for-the-badge&label=quality%20gate&logo=sonar&logoColor=white" />
23
+ </a>
24
+ <a href="https://codecov.io/gh/GeorgesAlkhouri/docctl">
25
+ <img alt="Codecov" src="https://img.shields.io/codecov/c/github/GeorgesAlkhouri/docctl?style=for-the-badge&logo=codecov&logoColor=white&label=codecov" />
26
+ </a>
27
+ </p>
28
+
29
+ <p align="center">
30
+ <a href="https://www.python.org/downloads/">
31
+ <img alt="Python 3.12 | 3.13" src="https://img.shields.io/badge/python-3.12%20%7C%203.13-3776AB?style=for-the-badge&logo=python&logoColor=white" />
32
+ </a>
33
+ <a href="https://github.com/GeorgesAlkhouri/docctl">
34
+ <img alt="Local-first" src="https://img.shields.io/badge/local--first-no%20cloud%20required-2EA44F?style=for-the-badge&logo=homeassistant&logoColor=white" />
35
+ </a>
36
+ <a href="https://docs.trychroma.com/">
37
+ <img alt="Chroma" src="https://img.shields.io/badge/chroma-vector%20store-FF6F00?style=for-the-badge&logo=sqlite&logoColor=white" />
38
+ </a>
39
+ </p>
40
+
41
+ ## Why docctl
42
+ - Optimized for agentic retrieval loops with fast multi-step questions and answers.
43
+ - Runs locally with a persistent Chroma-backed index.
44
+ - Ingests `.pdf`, `.docx`, `.txt`, and `.md` with provenance metadata (`doc_id`, `source`, `title`).
45
+ - Uses sentence-aware chunking for better retrieval quality.
46
+ - Supports deterministic `--json` output for automation and agents.
47
+ - Exposes stable CLI workflows for ingest, search, diagnostics, and inventory.
48
+
49
+ ## Agent Integration
50
+ Use [SKILL.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/SKILL.md) when you want an agent to drive `docctl` end-to-end.
51
+ The skill uses `session` for fast iterative retrieval.
52
+
53
+ ## Quickstart
54
+ Requirements:
55
+ - Python 3.12 or 3.13
56
+ - `uv`
57
+
58
+ ```bash
59
+ # 1) Install dependencies
60
+ uv sync --frozen --dev
61
+
62
+ # 2) Verify CLI
63
+ uv run docctl --help
64
+
65
+ # 3) Ingest supported files
66
+ uv run docctl ingest ./docs --recursive --approve-write --allow-model-download
67
+
68
+ # 4) Search indexed content
69
+ uv run docctl search "security gateway diagnostics" --top-k 5 --allow-model-download
70
+
71
+ # 5) Show one chunk by id (replace with an id from search output)
72
+ uv run docctl show <chunk_id_from_search> --allow-model-download
73
+ ```
74
+
75
+ After the first public release, install from PyPI with:
76
+
77
+ ```bash
78
+ pip install docctl
79
+ ```
80
+
81
+ ## Command Overview
82
+ | Command | Purpose |
83
+ |---|---|
84
+ | `docctl ingest <path>` | Ingest one supported file or a directory of supported files (mutates local index state). |
85
+ | `docctl search <query>` | Search indexed content with optional metadata filters. |
86
+ | `docctl show <chunk_id>` | Show one indexed chunk by exact id. |
87
+ | `docctl stats` | Show index statistics. |
88
+ | `docctl catalog` | Show index summary and per-document inventory. |
89
+ | `docctl doctor` | Run local diagnostics for index and embedding setup. |
90
+ | `docctl session` | Run a read-only NDJSON request session on stdin/stdout. |
91
+
92
+ ## JSON and Session Mode
93
+ Use `--json` for deterministic machine-readable output:
94
+
95
+ ```bash
96
+ uv run docctl --json search "security gateway diagnostics" --top-k 5 --allow-model-download
97
+ ```
98
+
99
+ Use `session` for NDJSON request/response flows. For agents, this is the preferred fast path whenever one workflow needs two or more read operations:
100
+
101
+ ```bash
102
+ cat <<'EOF' | uv run docctl session --allow-model-download
103
+ {"id":"q1","op":"search","query":"security gateway diagnostics","top_k":5}
104
+ {"id":"q2","op":"catalog"}
105
+ EOF
106
+ ```
107
+
108
+ ## Configuration
109
+ Global options:
110
+ - `--index-path` (default: `.docctl`)
111
+ - `--collection` (default: `default`)
112
+ - `--json` (deterministic JSON payloads on stdout)
113
+ - `--verbose` (extra diagnostics)
114
+
115
+ Model downloads are explicit:
116
+ - Use `--allow-model-download` when embedding artifacts are not already available.
117
+
118
+ Mutation boundaries:
119
+ - `ingest` is mutating.
120
+ - `search`, `show`, `stats`, `catalog`, `doctor`, and `session` are read-only.
121
+
122
+ ## Development
123
+ Run core quality checks:
124
+
125
+ ```bash
126
+ make lint
127
+ make format-check
128
+ make typecheck
129
+ make security-lint
130
+ make import-lint
131
+ make test
132
+ make test-cov
133
+ make check-markdown-links
134
+ ```
135
+
136
+ Apply formatting fixes:
137
+
138
+ ```bash
139
+ make format
140
+ ```
141
+
142
+ Build release artifacts locally:
143
+
144
+ ```bash
145
+ make build-dist
146
+ make check-dist
147
+ make release-dry-run
148
+ ```
149
+
150
+ ## Documentation Map
151
+ - [ARCHITECTURE.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/ARCHITECTURE.md)
152
+ - [docs/design-docs/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/design-docs/index.md)
153
+ - [docs/product-specs/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/product-specs/index.md)
154
+ - [docs/references/index.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/references/index.md)
155
+ - [SECURITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/SECURITY.md) (canonical vulnerability disclosure policy)
156
+ - [docs/RELIABILITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/RELIABILITY.md)
157
+ - [docs/SECURITY.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/SECURITY.md) (internal implementation security guardrails)
158
+ - [docs/PLANS.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/docs/PLANS.md)
159
+
160
+ ## Contributing
161
+ For implementation and validation workflow, start with:
162
+ 1. [AGENTS.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/AGENTS.md)
163
+ 2. [ARCHITECTURE.md](https://github.com/GeorgesAlkhouri/docctl/blob/main/ARCHITECTURE.md)
164
+ 3. The indexed docs under `docs/` listed above.
@@ -0,0 +1,152 @@
1
+ [build-system]
2
+ requires = ["setuptools>=82", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "docctl"
7
+ version = "0.1.0"
8
+ description = "CLI-first local document retrieval tool"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.12,<3.14"
13
+ keywords = ["cli", "documents", "local-first", "retrieval", "search"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Text Processing",
22
+ "Topic :: Utilities",
23
+ ]
24
+ dependencies = [
25
+ "chromadb>=0.5.23",
26
+ "llama-index-core>=0.11.23",
27
+ "python-docx",
28
+ "pdfplumber",
29
+ "pypdf",
30
+ "reportlab",
31
+ "sentence-transformers>=3.3.1",
32
+ "typer>=0.12.5",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/GeorgesAlkhouri/docctl"
37
+ Repository = "https://github.com/GeorgesAlkhouri/docctl"
38
+ Issues = "https://github.com/GeorgesAlkhouri/docctl/issues"
39
+ Changelog = "https://github.com/GeorgesAlkhouri/docctl/blob/main/CHANGELOG.md"
40
+
41
+ [project.scripts]
42
+ docctl = "docctl.cli:main"
43
+
44
+ [dependency-groups]
45
+ dev = [
46
+ "bandit>=1.9.4",
47
+ "import-linter>=2.7",
48
+ "mypy>=1.15.0",
49
+ "python-semantic-release>=10.5.3,<11",
50
+ "pytest>=8.3.4",
51
+ "pytest-cov>=6.1.1",
52
+ "ruff>=0.11.0",
53
+ "twine>=6.1.0,<7",
54
+ ]
55
+
56
+ [tool.uv]
57
+ package = true
58
+
59
+ [tool.semantic_release]
60
+ allow_zero_version = true
61
+ major_on_zero = false
62
+ commit_message = "chore(release): {version}\n\nAutomatically generated by python-semantic-release"
63
+ commit_parser = "conventional"
64
+ tag_format = "v{version}"
65
+ version_toml = ["pyproject.toml:project.version"]
66
+ build_command = "uv lock && git add uv.lock"
67
+
68
+ [tool.semantic_release.branches.main]
69
+ match = "main"
70
+ prerelease = false
71
+
72
+ [tool.semantic_release.changelog]
73
+ mode = "update"
74
+
75
+ [tool.semantic_release.changelog.default_templates]
76
+ changelog_file = "CHANGELOG.md"
77
+ output_format = "md"
78
+ mask_initial_release = true
79
+
80
+ [tool.semantic_release.commit_parser_options]
81
+ minor_tags = ["feat"]
82
+ patch_tags = ["fix", "perf"]
83
+ allowed_tags = ["build", "chore", "ci", "docs", "feat", "fix", "perf", "refactor", "style", "test"]
84
+ other_allowed_tags = ["build", "chore", "ci", "docs", "refactor", "style", "test"]
85
+ default_bump_level = 0
86
+ parse_squash_commits = true
87
+ ignore_merge_commits = true
88
+
89
+ [tool.semantic_release.remote]
90
+ name = "origin"
91
+ type = "github"
92
+ ignore_token_for_push = false
93
+
94
+ [tool.semantic_release.publish]
95
+ dist_glob_patterns = ["dist/*"]
96
+ upload_to_vcs_release = false
97
+
98
+ [tool.pytest.ini_options]
99
+ pythonpath = ["src"]
100
+ addopts = "-ra"
101
+
102
+ [tool.ruff]
103
+ target-version = "py312"
104
+ line-length = 100
105
+ src = ["src", "tests"]
106
+
107
+ [tool.ruff.lint]
108
+ select = [
109
+ "E4",
110
+ "E7",
111
+ "E9",
112
+ "F",
113
+ "I",
114
+ "B",
115
+ "C90",
116
+ "RET",
117
+ "SIM",
118
+ "ARG",
119
+ "PLR0911",
120
+ "PLR0912",
121
+ "PLR0913",
122
+ "PLR0915",
123
+ ]
124
+
125
+ [tool.ruff.lint.flake8-bugbear]
126
+ extend-immutable-calls = ["typer.Argument", "typer.Option"]
127
+
128
+ [tool.ruff.lint.mccabe]
129
+ max-complexity = 8
130
+
131
+ [tool.ruff.lint.pylint]
132
+ max-args = 5
133
+ max-positional-args = 4
134
+ max-branches = 10
135
+ max-locals = 12
136
+ max-public-methods = 12
137
+ max-returns = 5
138
+ max-statements = 35
139
+ max-nested-blocks = 4
140
+
141
+ [tool.ruff.lint.per-file-ignores]
142
+ "tests/**/*.py" = ["ARG", "PLR0913", "PLR0915"]
143
+ "src/docctl/cli.py" = ["PLR0913"]
144
+
145
+ [tool.ruff.format]
146
+ docstring-code-format = true
147
+
148
+ [tool.mypy]
149
+ python_version = "3.12"
150
+ files = ["src"]
151
+ ignore_missing_imports = true
152
+ warn_unused_configs = true
docctl-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
+ """docctl package."""
2
+
3
__all__ = ["__version__"]
# Package version string exported as the module's only public name.
# NOTE(review): the semantic-release settings in pyproject.toml list only
# `pyproject.toml:project.version` under `version_toml`; confirm this constant
# is also bumped on release so the two values cannot drift apart.
__version__ = "0.1.0"
@@ -0,0 +1,79 @@
1
+ """Sentence-aware chunking for extracted documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from llama_index.core.node_parser import SentenceSplitter
6
+ from llama_index.core.schema import Document, MetadataMode
7
+
8
+ from .ids import build_chunk_id
9
+ from .models import ChunkMetadata, ChunkRecord, TextUnit
10
+
11
+
12
def chunk_document_units(  # noqa: PLR0913 - explicit parameters keep the chunking API clear; wrapping into a config object here would add indirection without reducing complexity.
    *,
    doc_id: str,
    source: str,
    title: str,
    units: list[TextUnit],
    chunk_size: int = 220,
    chunk_overlap: int = 40,
) -> list[ChunkRecord]:
    """Convert text units into sentence-aware chunks while preserving metadata.

    Each unit is wrapped in its own ``Document`` carrying the document-level
    metadata, the documents are split with ``SentenceSplitter``, and every
    resulting node with non-empty text becomes one ``ChunkRecord``.

    Args:
        doc_id: Stable identifier of the source document.
        source: Source path or URI associated with the document.
        title: Human-readable title associated with the document.
        units: Extracted text units; only ``unit.text`` is read here.
        chunk_size: Maximum target size for each chunk, forwarded unchanged to
            ``SentenceSplitter``, which measures it in tokens (not characters).
            Smaller values create more, shorter chunks.
        chunk_overlap: Overlap between consecutive chunks, forwarded unchanged
            to ``SentenceSplitter`` and likewise measured in tokens, to
            preserve local context across boundaries.

    Returns:
        Chunk records in splitter output order. Chunk indexes start at 1 and
        count only the chunks that are kept, so skipped empty nodes leave no
        gaps in the numbering.
    """
    documents = [
        Document(
            text=unit.text,
            metadata={
                "doc_id": doc_id,
                "source": source,
                "title": title,
            },
            # Include the 1-based unit position so each source document has a
            # distinct id instead of all units sharing the same one.
            id_=f"{doc_id}:unit:{position}",
        )
        for position, unit in enumerate(units, start=1)
    ]

    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = splitter.get_nodes_from_documents(documents)

    records: list[ChunkRecord] = []
    for node in nodes:
        # MetadataMode.NONE requests the node text without metadata mixed in.
        text = node.get_content(metadata_mode=MetadataMode.NONE).strip()
        if not text:
            # Whitespace-only nodes are dropped and do not consume an index.
            continue

        metadata = dict(node.metadata)
        chunk_index = len(records) + 1
        chunk_id = build_chunk_id(
            doc_id=doc_id,
            chunk_index=chunk_index,
            text=text,
        )
        records.append(
            ChunkRecord(
                id=chunk_id,
                text=text,
                metadata=ChunkMetadata(
                    # Fall back to the call arguments if a key is missing
                    # from the node metadata.
                    doc_id=str(metadata.get("doc_id", doc_id)),
                    source=str(metadata.get("source", source)),
                    title=str(metadata.get("title", title)),
                    # No "section" key is set on the documents built above, so
                    # this is None unless the splitter supplies one.
                    section=metadata.get("section"),
                ),
            )
        )

    return records