recombinase 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ build/
5
+ dist/
6
+ .venv/
7
+ .pytest_cache/
8
+ .coverage
9
+ htmlcov/
10
+ *.pptx
11
+ *.pptm
12
+ !tests/fixtures/*.pptx
13
+ !examples/*.pptx
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Terry Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: recombinase
3
+ Version: 0.1.0
4
+ Summary: Template-guided document synthesis: extract structured content from source pptx files and recombine into a canonical template
5
+ Project-URL: Homepage, https://github.com/terry-li-hm/recombinase
6
+ Project-URL: Issues, https://github.com/terry-li-hm/recombinase/issues
7
+ Author: Terry Li
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: credentials,cv,extraction,generation,powerpoint,pptx,template
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Office/Business
22
+ Classifier: Topic :: Text Processing
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: python-pptx>=0.6.23
25
+ Requires-Dist: pyyaml>=6.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
28
+ Requires-Dist: pytest>=7.0; extra == 'dev'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # recombinase
32
+
33
+ > Biology: a recombinase is an enzyme that extracts DNA fragments and recombines them into new molecules using a homologous template as the structural guide. This package does the same for PowerPoint documents.
34
+
35
+ Template-guided pptx synthesis. Take a styled "filled example" slide in a `.pptx`/`.pptm` template, a folder of per-record YAML data files, and produce a populated output deck — one slide per record, visually identical to the template because the fill operation duplicates the source slide and replaces text in-place by shape name.
36
+
37
+ ## Install
38
+
39
+ ```
40
+ pip install --user recombinase
41
+ ```
42
+
43
+ Or from source:
44
+
45
+ ```
46
+ git clone https://github.com/terry-li-hm/recombinase.git
47
+ cd recombinase
48
+ pip install --user -e .
49
+ ```
50
+
51
+ Dependencies: `python-pptx`, `pyyaml`. That's it.
52
+
53
+ ## Concepts
54
+
55
+ **Three inputs**, loosely coupled by file:
56
+
57
+ 1. **Template** — a `.pptx`/`.pptm` file with at least one slide where every field you want to populate is a named shape (e.g. a text box named `Consultant_Name`).
58
+ 2. **Config** — a small YAML file declaring which shape name on the template corresponds to which data field. One config per template. Templates change; configs go with them.
59
+ 3. **Data** — a directory of per-record YAML files, one file per record (e.g. one per consultant). Each file has a flat map of field names to values. List values become bullet paragraphs automatically.
60
+
61
+ The template config is intentionally per-template rather than hardcoded in the package. Same package, different config → different template. You can build a CV pack, a use-case slide deck, or a client case study collection with the same tool and three different configs.
62
+
63
+ ## Usage
64
+
65
+ ### 1. Inspect a template
66
+
67
+ Discover the shape names on each slide — structural metadata only, never the actual text content. Safe to share the output.
68
+
69
+ ```
70
+ recombinase inspect "path/to/template.pptm"
71
+ ```
72
+
73
+ Example output:
74
+
75
+ ```
76
+ File: /path/to/template.pptm
77
+ Slide count: 1
78
+
79
+ === Slide 1 (layout: 'Blank') ===
80
+ - 'Consultant_Name' | type=TEXT_BOX (17) | text_chars=12 | paras=1, runs=1
81
+ - 'Role_Title' | type=TEXT_BOX (17) | text_chars=18 | paras=1, runs=1
82
+ - 'Summary_Body' | type=TEXT_BOX (17) | text_chars=140 | paras=2, runs=3
83
+ - 'Background_Bullets' | type=TEXT_BOX (17) | text_chars=220 | paras=5, runs=5
84
+ ```
85
+
86
+ ### 2. Scaffold a config
87
+
88
+ Generate a starter config file from the template's shape names:
89
+
90
+ ```
91
+ recombinase init "path/to/template.pptm" --output template-config.yaml
92
+ ```
93
+
94
+ This reads the shape names from slide 1 and writes a config like:
95
+
96
+ ```yaml
97
+ template: /path/to/template.pptm
98
+ source_slide_index: 1
99
+ clear_source_slide: true
100
+
101
+ placeholders:
102
+ consultant_name: Consultant_Name
103
+ role_title: Role_Title
104
+ summary_body: Summary_Body
105
+ background_bullets: Background_Bullets
106
+ ```
107
+
108
+ Edit the left side (data field names) to match how your records are keyed. For example, if your YAML data files use `name:` not `consultant_name:`, rename the left side:
109
+
110
+ ```yaml
111
+ placeholders:
112
+ name: Consultant_Name
113
+ role: Role_Title
114
+ summary: Summary_Body
115
+ background: Background_Bullets
116
+ ```
117
+
118
+ ### 3. Write per-record data files
119
+
120
+ Create a directory with one YAML file per record. Filenames become the sort order:
121
+
122
+ ```
123
+ cv-data/
124
+ ├── 01-jane-doe.yaml
125
+ ├── 02-john-smith.yaml
126
+ └── 03-alice-wong.yaml
127
+ ```
128
+
129
+ Each file is a flat map — lists become bullet paragraphs:
130
+
131
+ ```yaml
132
+ id: jane-doe
133
+ name: Jane Doe
134
+ role: Senior Consultant
135
+ summary: >-
136
+ Twelve years across global wealth management with a focus on
137
+ regulatory data and risk modelling.
138
+ background:
139
+ - Bank A — Risk modelling lead (2010-2015)
140
+ - Bank B — Head of analytics (2015-2020)
141
+ - Bank C — CDO, Asia Pacific (2020-present)
142
+ key_skills:
143
+ - Risk modelling
144
+ - Governance
145
+ - Wealth data architecture
146
+ ```
147
+
148
+ The field names on the left must match the keys in your template config's `placeholders:` section.
149
+
150
+ ### 4. Generate the output deck
151
+
152
+ ```
153
+ recombinase generate \
154
+ --config template-config.yaml \
155
+ --data-dir cv-data/ \
156
+ --output output/deck.pptx
157
+ ```
158
+
159
+ Produces a populated pptx with one slide per YAML file. If `clear_source_slide: true` is set in the config, the original example slide is removed from the output.
160
+
161
+ ### One-line end-to-end
162
+
163
+ After the config exists:
164
+
165
+ ```
166
+ recombinase generate -c template-config.yaml -d cv-data/ -o out.pptx
167
+ ```
168
+
169
+ ## Design notes
170
+
171
+ ### Why duplicate a filled example slide?
172
+
173
+ The alternative is creating slides from a layout and writing text into empty placeholders. That approach loses any hand-tweaks the template designer made (custom colours, tweaked positions, decorative shapes, non-placeholder elements). Duplicating a known-good filled slide inherits 100% of its visual styling by design — `deepcopy` of the shape tree carries every property.
174
+
175
+ Trade-off: the template must contain one "canonical good example" slide to clone from. This is usually natural for CV templates and pack-prep work.
176
+
177
+ ### Rich text and the flattening caveat
178
+
179
+ When a value is written into a shape with `shape.text_frame.text = "..."`, rich-text runs within that shape (bold name, italic subtitle in one text frame) collapse to the placeholder's default run style. For most modern consulting templates this isn't an issue — each styled fragment lives in its own shape. If your template has a multi-run placeholder, either split it into separate shapes or accept the flattening.
180
+
181
+ ### Variable-length lists
182
+
183
+ List values in the YAML data become separate paragraphs in the target text frame, inheriting the placeholder's paragraph-level bullet formatting automatically. No bullet markers in the source data — the template supplies them. A consultant with three background bullets and another with seven both work without any config change.
184
+
185
+ ### Warnings, not errors
186
+
187
+ If a config references a shape name that doesn't exist, or a record is missing a field, `generate` produces a **warning** but continues. This is deliberate: partial output is more useful than total failure during iteration. Pass `--strict` if you want non-zero exit on warnings.
188
+
189
+ ## Scope (v0.1)
190
+
191
+ - ✓ Inspect: print template structural metadata
192
+ - ✓ Init: scaffold a config from shape names
193
+ - ✓ Generate: populate template from YAML records
194
+ - ✗ Extract: reverse direction (pptx → YAML) — deferred to v0.2; it needs a sample source file for structure before it can be implemented reliably
195
+
196
+ ## License
197
+
198
+ MIT
@@ -0,0 +1,168 @@
1
+ # recombinase
2
+
3
+ > Biology: a recombinase is an enzyme that extracts DNA fragments and recombines them into new molecules using a homologous template as the structural guide. This package does the same for PowerPoint documents.
4
+
5
+ Template-guided pptx synthesis. Take a styled "filled example" slide in a `.pptx`/`.pptm` template, a folder of per-record YAML data files, and produce a populated output deck — one slide per record, visually identical to the template because the fill operation duplicates the source slide and replaces text in-place by shape name.
6
+
7
+ ## Install
8
+
9
+ ```
10
+ pip install --user recombinase
11
+ ```
12
+
13
+ Or from source:
14
+
15
+ ```
16
+ git clone https://github.com/terry-li-hm/recombinase.git
17
+ cd recombinase
18
+ pip install --user -e .
19
+ ```
20
+
21
+ Dependencies: `python-pptx`, `pyyaml`. That's it.
22
+
23
+ ## Concepts
24
+
25
+ **Three inputs**, loosely coupled by file:
26
+
27
+ 1. **Template** — a `.pptx`/`.pptm` file with at least one slide where every field you want to populate is a named shape (e.g. a text box named `Consultant_Name`).
28
+ 2. **Config** — a small YAML file declaring which shape name on the template corresponds to which data field. One config per template. Templates change; configs go with them.
29
+ 3. **Data** — a directory of per-record YAML files, one file per record (e.g. one per consultant). Each file has a flat map of field names to values. List values become bullet paragraphs automatically.
30
+
31
+ The template config is intentionally per-template rather than hardcoded in the package. Same package, different config → different template. You can build a CV pack, a use-case slide deck, or a client case study collection with the same tool and three different configs.
32
+
33
+ ## Usage
34
+
35
+ ### 1. Inspect a template
36
+
37
+ Discover the shape names on each slide — structural metadata only, never the actual text content. Safe to share the output.
38
+
39
+ ```
40
+ recombinase inspect "path/to/template.pptm"
41
+ ```
42
+
43
+ Example output:
44
+
45
+ ```
46
+ File: /path/to/template.pptm
47
+ Slide count: 1
48
+
49
+ === Slide 1 (layout: 'Blank') ===
50
+ - 'Consultant_Name' | type=TEXT_BOX (17) | text_chars=12 | paras=1, runs=1
51
+ - 'Role_Title' | type=TEXT_BOX (17) | text_chars=18 | paras=1, runs=1
52
+ - 'Summary_Body' | type=TEXT_BOX (17) | text_chars=140 | paras=2, runs=3
53
+ - 'Background_Bullets' | type=TEXT_BOX (17) | text_chars=220 | paras=5, runs=5
54
+ ```
55
+
56
+ ### 2. Scaffold a config
57
+
58
+ Generate a starter config file from the template's shape names:
59
+
60
+ ```
61
+ recombinase init "path/to/template.pptm" --output template-config.yaml
62
+ ```
63
+
64
+ This reads the shape names from slide 1 and writes a config like:
65
+
66
+ ```yaml
67
+ template: /path/to/template.pptm
68
+ source_slide_index: 1
69
+ clear_source_slide: true
70
+
71
+ placeholders:
72
+ consultant_name: Consultant_Name
73
+ role_title: Role_Title
74
+ summary_body: Summary_Body
75
+ background_bullets: Background_Bullets
76
+ ```
77
+
78
+ Edit the left side (data field names) to match how your records are keyed. For example, if your YAML data files use `name:` not `consultant_name:`, rename the left side:
79
+
80
+ ```yaml
81
+ placeholders:
82
+ name: Consultant_Name
83
+ role: Role_Title
84
+ summary: Summary_Body
85
+ background: Background_Bullets
86
+ ```
87
+
88
+ ### 3. Write per-record data files
89
+
90
+ Create a directory with one YAML file per record. Filenames become the sort order:
91
+
92
+ ```
93
+ cv-data/
94
+ ├── 01-jane-doe.yaml
95
+ ├── 02-john-smith.yaml
96
+ └── 03-alice-wong.yaml
97
+ ```
98
+
99
+ Each file is a flat map — lists become bullet paragraphs:
100
+
101
+ ```yaml
102
+ id: jane-doe
103
+ name: Jane Doe
104
+ role: Senior Consultant
105
+ summary: >-
106
+ Twelve years across global wealth management with a focus on
107
+ regulatory data and risk modelling.
108
+ background:
109
+ - Bank A — Risk modelling lead (2010-2015)
110
+ - Bank B — Head of analytics (2015-2020)
111
+ - Bank C — CDO, Asia Pacific (2020-present)
112
+ key_skills:
113
+ - Risk modelling
114
+ - Governance
115
+ - Wealth data architecture
116
+ ```
117
+
118
+ The field names on the left must match the keys in your template config's `placeholders:` section.
119
+
120
+ ### 4. Generate the output deck
121
+
122
+ ```
123
+ recombinase generate \
124
+ --config template-config.yaml \
125
+ --data-dir cv-data/ \
126
+ --output output/deck.pptx
127
+ ```
128
+
129
+ Produces a populated pptx with one slide per YAML file. If `clear_source_slide: true` is set in the config, the original example slide is removed from the output.
130
+
131
+ ### One-line end-to-end
132
+
133
+ After the config exists:
134
+
135
+ ```
136
+ recombinase generate -c template-config.yaml -d cv-data/ -o out.pptx
137
+ ```
138
+
139
+ ## Design notes
140
+
141
+ ### Why duplicate a filled example slide?
142
+
143
+ The alternative is creating slides from a layout and writing text into empty placeholders. That approach loses any hand-tweaks the template designer made (custom colours, tweaked positions, decorative shapes, non-placeholder elements). Duplicating a known-good filled slide inherits 100% of its visual styling by design — `deepcopy` of the shape tree carries every property.
144
+
145
+ Trade-off: the template must contain one "canonical good example" slide to clone from. This is usually natural for CV templates and pack-prep work.
146
+
147
+ ### Rich text and the flattening caveat
148
+
149
+ When a value is written into a shape with `shape.text_frame.text = "..."`, rich-text runs within that shape (bold name, italic subtitle in one text frame) collapse to the placeholder's default run style. For most modern consulting templates this isn't an issue — each styled fragment lives in its own shape. If your template has a multi-run placeholder, either split it into separate shapes or accept the flattening.
150
+
151
+ ### Variable-length lists
152
+
153
+ List values in the YAML data become separate paragraphs in the target text frame, inheriting the placeholder's paragraph-level bullet formatting automatically. No bullet markers in the source data — the template supplies them. A consultant with three background bullets and another with seven both work without any config change.
154
+
155
+ ### Warnings, not errors
156
+
157
+ If a config references a shape name that doesn't exist, or a record is missing a field, `generate` produces a **warning** but continues. This is deliberate: partial output is more useful than total failure during iteration. Pass `--strict` if you want non-zero exit on warnings.
158
+
159
+ ## Scope (v0.1)
160
+
161
+ - ✓ Inspect: print template structural metadata
162
+ - ✓ Init: scaffold a config from shape names
163
+ - ✓ Generate: populate template from YAML records
164
+ - ✗ Extract: reverse direction (pptx → YAML) — deferred to v0.2; it needs a sample source file for structure before it can be implemented reliably
165
+
166
+ ## License
167
+
168
+ MIT
@@ -0,0 +1,46 @@
1
+ # Example per-record data file for recombinase.
2
+ #
3
+ # One file per record (e.g. one per consultant). File names become the sort
4
+ # order in the output deck, so prefix with 01-, 02-, etc. if you want a
5
+ # specific ordering.
6
+ #
7
+ # The field names on the LEFT must match the keys in your template config's
8
+ # `placeholders:` section. Values can be scalar strings, integers, or lists.
9
+ # Lists become separate paragraphs (bullet points) in the target shape,
10
+ # inheriting the template's paragraph-level bullet formatting automatically.
11
+
12
+ id: jane-doe
13
+ name: Jane Doe
14
+ role: Senior Consultant
15
+ years_experience: 12
16
+
17
+ summary: >-
18
+ Wealth-data specialist with twelve years across global banking and
19
+ consulting. Led data governance and AI strategy engagements at
20
+ tier-1 APAC financial institutions.
21
+
22
+ background:
23
+ - Bank A — Risk modelling lead (2010-2015)
24
+ - Bank B — Head of analytics (2015-2020)
25
+ - Bank C — Chief Data Officer, Asia Pacific (2020-present)
26
+
27
+ key_skills:
28
+ - Risk modelling and stress testing
29
+ - AI governance and model risk management
30
+ - Wealth data architecture
31
+ - Regulatory reporting automation
32
+
33
+ clients:
34
+ - HSBC
35
+ - UBS
36
+ - DBS
37
+
38
+ qualifications:
39
+ - CFA Charterholder
40
+ - FRM
41
+ - MSc Financial Engineering
42
+
43
+ languages:
44
+ - English (native)
45
+ - Cantonese (fluent)
46
+ - Mandarin (working)
@@ -0,0 +1,33 @@
1
+ # Example template config for recombinase.
2
+ #
3
+ # One config per pptx template. The `template:` path can be absolute or
4
+ # relative to this config file's directory. The `placeholders:` section
5
+ # maps data field names (left) to the shape .Name property in the template
6
+ # (right). You find the shape names by running `recombinase inspect` or
7
+ # `recombinase init` on the template first.
8
+
9
+ template: ./CV_template.pptx
10
+
11
+ # 1-based index of the slide inside the template that should be duplicated
12
+ # once per record. Usually this is the "filled example" slide that's already
13
+ # styled correctly.
14
+ source_slide_index: 1
15
+
16
+ # Remove the source (example) slide from the final output so only the
17
+ # generated per-record slides remain. Set to false if you want the example
18
+ # preserved as slide 1 of the output for reference.
19
+ clear_source_slide: true
20
+
21
+ # Map from data field name → shape .Name in the template.
22
+ # The LEFT side must match the keys in your per-record YAML data files.
23
+ # The RIGHT side must match the shape names reported by `recombinase inspect`.
24
+ placeholders:
25
+ name: Consultant_Name
26
+ role: Role_Title
27
+ years_experience: Years_Experience
28
+ summary: Summary_Body
29
+ background: Background_Bullets
30
+ key_skills: Key_Skills
31
+ clients: Clients_Served
32
+ qualifications: Qualifications
33
+ languages: Languages
@@ -0,0 +1,53 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "recombinase"
7
+ version = "0.1.0"
8
+ description = "Template-guided document synthesis: extract structured content from source pptx files and recombine into a canonical template"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Terry Li" },
14
+ ]
15
+ keywords = ["pptx", "powerpoint", "template", "cv", "credentials", "extraction", "generation"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: End Users/Desktop",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: OS Independent",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3 :: Only",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Office/Business",
28
+ "Topic :: Text Processing",
29
+ ]
30
+ dependencies = [
31
+ "python-pptx>=0.6.23",
32
+ "pyyaml>=6.0",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = [
37
+ "pytest>=7.0",
38
+ "pytest-cov>=4.0",
39
+ ]
40
+
41
+ [project.scripts]
42
+ recombinase = "recombinase.cli:main"
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/terry-li-hm/recombinase"
46
+ Issues = "https://github.com/terry-li-hm/recombinase/issues"
47
+
48
+ [tool.hatch.build.targets.wheel]
49
+ packages = ["src/recombinase"]
50
+
51
+ [tool.ruff]
52
+ target-version = "py310"
53
+ line-length = 100
@@ -0,0 +1,21 @@
1
+ """recombinase — template-guided document synthesis.
2
+
3
+ Biology: a recombinase is an enzyme that extracts DNA fragments and recombines
4
+ them into new molecules using a homologous template as the structural guide.
5
+ This package does the same for PowerPoint documents: v0.1 recombines YAML records
6
+ into a canonical template; extraction from heterogeneous source files is planned.
7
+ """
8
+
9
+ from recombinase.config import TemplateConfig, load_config
10
+ from recombinase.generate import generate_deck
11
+ from recombinase.inspect import inspect_template
12
+
13
+ __version__ = "0.1.0"
14
+
15
+ __all__ = [
16
+ "TemplateConfig",
17
+ "load_config",
18
+ "generate_deck",
19
+ "inspect_template",
20
+ "__version__",
21
+ ]