booktx 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- booktx-0.1.0/.gitignore +225 -0
- booktx-0.1.0/LICENSE +21 -0
- booktx-0.1.0/PKG-INFO +314 -0
- booktx-0.1.0/README.md +273 -0
- booktx-0.1.0/booktx/__init__.py +10 -0
- booktx-0.1.0/booktx/_version.py +24 -0
- booktx-0.1.0/booktx/build.py +291 -0
- booktx-0.1.0/booktx/chapters.py +302 -0
- booktx-0.1.0/booktx/chunking.py +178 -0
- booktx-0.1.0/booktx/cli.py +1786 -0
- booktx-0.1.0/booktx/config.py +421 -0
- booktx-0.1.0/booktx/context.py +494 -0
- booktx-0.1.0/booktx/epub_io.py +113 -0
- booktx-0.1.0/booktx/epub_manifest.py +397 -0
- booktx-0.1.0/booktx/html_io.py +360 -0
- booktx-0.1.0/booktx/markdown_io.py +252 -0
- booktx-0.1.0/booktx/models.py +275 -0
- booktx-0.1.0/booktx/placeholders.py +165 -0
- booktx-0.1.0/booktx/progress.py +130 -0
- booktx-0.1.0/booktx/validate.py +685 -0
- booktx-0.1.0/pyproject.toml +76 -0
- booktx-0.1.0/skills/booktx/SKILL.md +200 -0
- booktx-0.1.0/tests/__init__.py +1 -0
- booktx-0.1.0/tests/test_build.py +319 -0
- booktx-0.1.0/tests/test_chapters.py +161 -0
- booktx-0.1.0/tests/test_chunking.py +217 -0
- booktx-0.1.0/tests/test_cli.py +312 -0
- booktx-0.1.0/tests/test_cli_chapters.py +107 -0
- booktx-0.1.0/tests/test_cli_context.py +166 -0
- booktx-0.1.0/tests/test_cli_translate.py +492 -0
- booktx-0.1.0/tests/test_config.py +166 -0
- booktx-0.1.0/tests/test_context_io.py +135 -0
- booktx-0.1.0/tests/test_context_models.py +145 -0
- booktx-0.1.0/tests/test_epub_io.py +285 -0
- booktx-0.1.0/tests/test_html_io.py +96 -0
- booktx-0.1.0/tests/test_markdown_io.py +129 -0
- booktx-0.1.0/tests/test_models.py +251 -0
- booktx-0.1.0/tests/test_placeholders.py +103 -0
- booktx-0.1.0/tests/test_progress.py +55 -0
- booktx-0.1.0/tests/test_validate.py +395 -0
- booktx-0.1.0/tests/test_validate_context.py +131 -0
booktx-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
208
|
+
|
|
209
|
+
.codex
|
|
210
|
+
|
|
211
|
+
booktx/_version.py
|
|
212
|
+
|
|
213
|
+
context_booktx.md
|
|
214
|
+
context_booktx.index.json
|
|
215
|
+
*.unpack.py
|
|
216
|
+
workspace/
|
|
217
|
+
.taskledger/
|
|
218
|
+
.pi-lens/
|
|
219
|
+
.code/
|
|
220
|
+
todo.md
|
|
221
|
+
uv.lock
|
|
222
|
+
/plan.md
|
|
223
|
+
|
|
224
|
+
# Do not ignore .booktx/ingest; it is intentionally durable.
|
|
225
|
+
# Optional ignores remain okay for generated reports/output if desired.
|
booktx-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Holger Nahrstaedt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
booktx-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: booktx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic CLI that prepares Markdown and EPUB documents for translation by a coding agent.
|
|
5
|
+
Project-URL: Homepage, https://github.com/booktx/booktx
|
|
6
|
+
Project-URL: Repository, https://github.com/holgern/booktx
|
|
7
|
+
Project-URL: Issues, https://github.com/holgern/booktx/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/holgern/booktx/blob/main/CHANGELOG.md
|
|
9
|
+
Author: booktx contributors
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: cli,epub,localization,markdown,translation
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: beautifulsoup4
|
|
28
|
+
Requires-Dist: epub2text>=0.2.4
|
|
29
|
+
Requires-Dist: markdown-it-py
|
|
30
|
+
Requires-Dist: phrasplit
|
|
31
|
+
Requires-Dist: pydantic>=2
|
|
32
|
+
Requires-Dist: rich
|
|
33
|
+
Requires-Dist: text2epub
|
|
34
|
+
Requires-Dist: tomli-w
|
|
35
|
+
Requires-Dist: tomli; python_version < '3.11'
|
|
36
|
+
Requires-Dist: typer
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
39
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
|
|
42
|
+
[booktx on PyPI](https://pypi.org/project/booktx/)
|
|
43
|
+

|
|
44
|
+

|
|
45
|
+
|
|
46
|
+
# booktx
|
|
47
|
+
|
|
48
|
+
`booktx` is a deterministic command-line tool that prepares **Markdown** and
|
|
49
|
+
**EPUB** documents for translation by a coding agent (or a human translator).
|
|
50
|
+
It does the mechanical bookkeeping — extract translatable sentences, report
|
|
51
|
+
progress, hand out the next safe work unit, validate submissions, rebuild the
|
|
52
|
+
document — and leaves the actual translation to you or your agent.
|
|
53
|
+
|
|
54
|
+
**booktx never translates text itself** and makes no LLM or network calls. All
|
|
55
|
+
translation text comes from local CLI submissions that `booktx` stores and validates.
|
|
56
|
+
|
|
57
|
+
## Legal notice
|
|
58
|
+
|
|
59
|
+
booktx is intended for DRM-free documents that you lawfully own or are allowed
|
|
60
|
+
to process. The license of booktx applies only to the software, not to input
|
|
61
|
+
books or generated translations. Do not redistribute translated books unless
|
|
62
|
+
you have the rights to do so.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Install
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install -e . # editable install from a checkout
|
|
70
|
+
# or, from PyPI:
|
|
71
|
+
pip install booktx
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Requires Python 3.10+. The `booktx` console script is installed automatically.
|
|
75
|
+
|
|
76
|
+
## Project layout
|
|
77
|
+
|
|
78
|
+
`booktx init ./book --target de` creates this layout:
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
book/
|
|
82
|
+
source/
|
|
83
|
+
book.md # or book.epub — exactly one source document
|
|
84
|
+
.booktx/
|
|
85
|
+
config.toml # source/target language, format, chunk size
|
|
86
|
+
manifest.json # source digest + EPUB extraction/rebuild metadata
|
|
87
|
+
names.json # manually protected verbatim terms (names, brands, places)
|
|
88
|
+
context.json # authoritative style/glossary/questions context
|
|
89
|
+
context.md # rendered context that agents must read before translating
|
|
90
|
+
chapter-map.json # detected chapter -> chunk ranges (additive metadata)
|
|
91
|
+
chunks/ # 0001.json, 0002.json ... (booktx writes these)
|
|
92
|
+
translation-store.json # primary record-level translation state
|
|
93
|
+
tasks/ # persisted work items created by booktx translate next
|
|
94
|
+
translated/ # compatibility/export chunk JSON managed by booktx
|
|
95
|
+
reports/ # validation-report.json
|
|
96
|
+
output/
|
|
97
|
+
book.de.md # or book.de.epub — the rebuilt translated document
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Commands
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
booktx init ./book --target de # create the project
|
|
104
|
+
booktx init ./book --target de --source book.md --source-lang en
|
|
105
|
+
booktx inspect ./book # summarise the source
|
|
106
|
+
booktx extract ./book # write .booktx/chunks/*.json
|
|
107
|
+
booktx context init ./book --non-interactive # create open questions/context
|
|
108
|
+
booktx context questions ./book # show required context questions
|
|
109
|
+
booktx context answer ./book Q001 --text de-DE # answer one context question
|
|
110
|
+
booktx context mark-ready ./book # mark ready after required answers
|
|
111
|
+
booktx status ./book # report translation progress
|
|
112
|
+
booktx chapters ./book # list detected chapter ranges
|
|
113
|
+
booktx translate next ./book --unit batch --max-words 700 --format block
|
|
114
|
+
booktx translate insert ./book --stdin --format block
|
|
115
|
+
booktx translate import-legacy ./book # import valid translated/*.json
|
|
116
|
+
booktx translate export ./book # export full translated chunks
|
|
117
|
+
booktx next ./book # legacy next-chunk summary
|
|
118
|
+
booktx next ./book --unit chapter # legacy next-chapter summary
|
|
119
|
+
booktx next-chapter ./book # chapter workflow shortcut
|
|
120
|
+
booktx validate ./book # enforce contract + context lint
|
|
121
|
+
booktx build ./book # rebuild output/book.<target>.<ext>
|
|
122
|
+
booktx build ./book --require-complete # fail on any missing/invalid record
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
`booktx translate next` refuses to return translation work until `.booktx/context.json`
|
|
126
|
+
exists and has `ready: true`. Use `--allow-missing-context` only for legacy
|
|
127
|
+
workflows and tests that deliberately bypass the context gate.
|
|
128
|
+
|
|
129
|
+
`booktx status` reports record-, chunk-, chapter-, and word-level progress.
|
|
130
|
+
`booktx translate next` returns the next paragraph, batch, chunk, or chapter
|
|
131
|
+
task together with a task id and submit hint. `booktx translate insert`
|
|
132
|
+
validates each submitted record before writing `translation-store.json`.
|
|
133
|
+
|
|
134
|
+
`booktx next --unit chapter` and `booktx next-chapter` remain available as
|
|
135
|
+
legacy summaries, but they now point agents at `booktx translate next` and
|
|
136
|
+
`booktx translate insert`. `booktx chapters` writes `.booktx/chapter-map.json`
|
|
137
|
+
and lists detected chapter ranges.
|
|
138
|
+
|
|
139
|
+
`booktx context init --non-interactive` creates a not-ready context with open
|
|
140
|
+
questions and a seed glossary. Required questions must be answered before
|
|
141
|
+
`booktx context mark-ready` succeeds. `context.md` is generated from
|
|
142
|
+
`context.json`; the JSON file is authoritative.
|
|
143
|
+
|
|
144
|
+
`booktx extract` is **idempotent**: it rebuilds `chunks/` on every run but
|
|
145
|
+
leaves both `translation-store.json` and compatibility `translated/` files
|
|
146
|
+
untouched, so re-extracting after editing the source never destroys work in
|
|
147
|
+
progress. Stale `translated/*.json` files whose chunk no longer exists are kept
|
|
148
|
+
and reported as warnings.
|
|
149
|
+
|
|
150
|
+
## The translation contract
|
|
151
|
+
|
|
152
|
+
`booktx extract` writes a chunk file like this:
|
|
153
|
+
|
|
154
|
+
```json
|
|
155
|
+
{
|
|
156
|
+
"chunk_id": "0001",
|
|
157
|
+
"source_language": "en",
|
|
158
|
+
"target_language": "de",
|
|
159
|
+
"records": [
|
|
160
|
+
{
|
|
161
|
+
"id": "0001-000001",
|
|
162
|
+
"source": "Alice looked at Mr. Smith.",
|
|
163
|
+
"protected_terms": ["Alice", "Mr. Smith"],
|
|
164
|
+
"placeholders": []
|
|
165
|
+
}
|
|
166
|
+
]
|
|
167
|
+
}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
`booktx translate next` creates a durable `.booktx/ingest/TASK.block.txt` template for block-text submissions and keeps `.booktx/ingest/TASK.json` for compatibility tooling. Prefer `booktx translate insert --stdin --format block` for normal agent work, or submit the durable `.block.txt` file when you want the payload version-controlled. `booktx translate insert --json-file .booktx/ingest/TASK.json` remains available for JSON-based tooling. When you need compatibility chunk files,
|
|
171
|
+
`booktx translate export` materializes `.booktx/translated/NNNN.json` from the
|
|
172
|
+
accepted store entries:
|
|
173
|
+
|
|
174
|
+
```json
|
|
175
|
+
{
|
|
176
|
+
"chunk_id": "0001",
|
|
177
|
+
"records": [
|
|
178
|
+
{
|
|
179
|
+
"id": "0001-000001",
|
|
180
|
+
"target": "Alice sah Mr. Smith an."
|
|
181
|
+
}
|
|
182
|
+
]
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Hard rules (enforced by `booktx validate`)
|
|
187
|
+
|
|
188
|
+
A translated chunk is rejected if any of the following is true:
|
|
189
|
+
|
|
190
|
+
- the JSON is invalid, or there is commentary outside the JSON object;
|
|
191
|
+
- the record count changed;
|
|
192
|
+
- any record id changed;
|
|
193
|
+
- any target is empty;
|
|
194
|
+
- a placeholder (`__NAME_NNN__` or any visible legacy `__TAG_NNN__`) was removed, changed, or added;
|
|
195
|
+
- a protected name was translated or removed.
|
|
196
|
+
|
|
197
|
+
The goal is **one source sentence to one translated sentence**. The validator
|
|
198
|
+
never merges or splits records.
|
|
199
|
+
|
|
200
|
+
## Placeholders and protected names
|
|
201
|
+
|
|
202
|
+
Before segmentation, booktx hides protected spans behind stable tokens and
|
|
203
|
+
restores them during build:
|
|
204
|
+
|
|
205
|
+
```text
|
|
206
|
+
Alice -> __NAME_001__ (from names.json#protected_terms)
|
|
207
|
+
Mr. Smith -> __NAME_002__
|
|
208
|
+
inline code -> __TAG_001__ (markdown / legacy EPUB chunks)
|
|
209
|
+
link URL -> __TAG_002__ (markdown / legacy EPUB chunks)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Edit `.booktx/names.json` to add names, brands, or places that must survive
|
|
213
|
+
translation untouched:
|
|
214
|
+
|
|
215
|
+
```json
|
|
216
|
+
{
|
|
217
|
+
"protected_terms": ["Alice", "Mr. Smith", "Baker Street"]
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
The agent **must** preserve every visible `__NAME_NNN__` token and every visible
|
|
222
|
+
legacy `__TAG_NNN__` token exactly. New EPUB extraction no longer emits TAG
|
|
223
|
+
tokens; if they appear in freshly extracted EPUB chunks, treat that as a
|
|
224
|
+
maintenance defect and re-run extraction after upgrading.
|
|
225
|
+
|
|
226
|
+
## Markdown handling
|
|
227
|
+
|
|
228
|
+
- Translate prose text only.
|
|
229
|
+
- Do not translate fenced code blocks, inline code, URLs, or YAML front-matter
|
|
230
|
+
**keys** (front-matter values are prose).
|
|
231
|
+
- Preserve headings, lists, blockquotes, links, emphasis, and tables.
|
|
232
|
+
|
|
233
|
+
booktx replaces each extracted prose span with an internal placeholder and
|
|
234
|
+
reinserts the translated text during build.
|
|
235
|
+
|
|
236
|
+
## EPUB handling
|
|
237
|
+
|
|
238
|
+
- Extract with `epub2text`; rebuild with `text2epub`.
|
|
239
|
+
- New EPUB chunks contain clean block text plus `__NAME_NNN__` placeholders only.
|
|
240
|
+
- `booktx build` uses the stored `.booktx/manifest.json` extraction data as the
|
|
241
|
+
source of truth and fails if the source EPUB SHA changed after extract.
|
|
242
|
+
- Identity/no-op EPUB builds are byte-identical to the extracted source EPUB.
|
|
243
|
+
- Changed EPUB blocks rebuild as valid EPUB with no leaked internal tokens.
|
|
244
|
+
- The original source EPUB is never modified.
|
|
245
|
+
|
|
246
|
+
### EPUB MVP inline-formatting tradeoff
|
|
247
|
+
|
|
248
|
+
The current EPUB rebuild path replaces changed blocks with escaped translated
|
|
249
|
+
text for the whole block body. That means identity/no-op builds preserve
|
|
250
|
+
everything byte-for-byte, but changed blocks may lose inner inline markup such
|
|
251
|
+
as `<strong>` or `<em>` until a future text-run-preserving replacement mode is
|
|
252
|
+
added.
|
|
253
|
+
|
|
254
|
+
### EPUB migration note
|
|
255
|
+
|
|
256
|
+
Existing EPUB projects extracted with the legacy TAG-placeholder pipeline should
|
|
257
|
+
be re-extracted after upgrading:
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
booktx extract ./project
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
Do not mix old EPUB chunks containing `__TAG_NNN__` with the new manifest v2
|
|
264
|
+
pipeline.
|
|
265
|
+
|
|
266
|
+
## End-to-end example (Markdown)
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
booktx init ./demo --target de
|
|
270
|
+
cp book.md ./demo/source/
|
|
271
|
+
booktx extract ./demo
|
|
272
|
+
booktx context init ./demo --non-interactive
|
|
273
|
+
booktx context mark-ready ./demo --force
|
|
274
|
+
booktx translate next ./demo --unit batch --max-words 700 --format block
|
|
275
|
+
|
|
276
|
+
# Submit the returned records through the CLI, then:
|
|
277
|
+
|
|
278
|
+
booktx validate ./demo
|
|
279
|
+
booktx build ./demo # -> demo/output/book.de.md
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## End-to-end example (EPUB)
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
booktx init ./demo --target de --source-file book.epub
|
|
286
|
+
booktx extract ./demo
|
|
287
|
+
booktx context init ./demo --non-interactive
|
|
288
|
+
booktx context mark-ready ./demo --force
|
|
289
|
+
booktx translate next ./demo --chapter 0001 --unit batch --max-words 700 --format block
|
|
290
|
+
|
|
291
|
+
# Submit translated records with booktx translate insert, then:
|
|
292
|
+
|
|
293
|
+
booktx validate ./demo
|
|
294
|
+
booktx build ./demo # -> demo/output/book.de.epub
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
## What v1 does NOT do
|
|
298
|
+
|
|
299
|
+
PDF, DOCX, AsciiDoc, a web UI, direct OpenAI/Anthropic/Ollama API calls, DRM
|
|
300
|
+
handling, automatic publishing, translation memory, or parallel agent
|
|
301
|
+
execution. The CLI itself performs no translation. v1 is intentionally small
|
|
302
|
+
and deterministic.
|
|
303
|
+
|
|
304
|
+
## Development
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
pip install -e '.[dev]'
|
|
308
|
+
pytest -q
|
|
309
|
+
ruff check .
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## License
|
|
313
|
+
|
|
314
|
+
MIT. See [LICENSE](LICENSE).
|