philoch-bib-sdk 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/Cargo.lock +1 -1
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/Cargo.toml +1 -1
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/PKG-INFO +3 -2
- philoch_bib_sdk-0.5.0/docs/generic-style-guide.md +648 -0
- philoch_bib_sdk-0.5.0/philoch_bib_sdk/adapters/api/__init__.py +642 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/adapters/io/csv/__init__.py +4 -7
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/adapters/io/ods/__init__.py +7 -7
- philoch_bib_sdk-0.5.0/philoch_bib_sdk/converters/api/__init__.py +508 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +14 -6
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +1 -3
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/parser.py +6 -6
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/journal/formatter.py +3 -3
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +6 -4
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/functions/comparator.py +2 -2
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/functions/fuzzy_matcher.py +26 -43
- philoch_bib_sdk-0.5.0/philoch_bib_sdk/procedures/import_to_api.py +662 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/poetry.lock +139 -51
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/pyproject.toml +22 -2
- philoch_bib_sdk-0.5.0/scripts/import_ods_to_api.py +140 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/conftest.py +4 -5
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_bibitem_parser.py +31 -31
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_bibkey_parser.py +2 -0
- philoch_bib_sdk-0.5.0/tests/data/test_import.csv +4 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/functions/test_fuzzy_matcher.py +3 -8
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/test_default_models.py +0 -6
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/test_setup.py +0 -16
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/LICENSE +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/README.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/fuzzy-matching.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/python-style-guide.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/rust-implementation-summary.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/rust-index-building-spec.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/rust-scorer.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/streaming-output.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/todo/fuzzy-matching-enhanced-output.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/todo/merge_fuzzy_results.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/docs/todo/rust-build-index-implementation-plan.md +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/__init__.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/_rust.pyi +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/adapters/io/__init__.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/latex.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/author/formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/author/parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/converters/plaintext/journal/parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/interfaces/cli/__init__.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/__init__.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/default_models.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/functions/__init__.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/functions/journal_article_matcher.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/literals.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/models.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/logic/models_staging.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/procedures/fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/philoch_bib_sdk/py.typed +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/run_fuzzy_matching.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/run_fuzzy_matching_streaming.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/scripts/format.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/src/lib.rs +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/adapters/test_read_jvn_index_from_ods.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_author_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_author_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_bibitem_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_bibkey_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_date_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_date_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_journal_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_journal_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_page_formatter.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/converters/plaintext/test_page_parser.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/functions/test_comparator.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/functions/test_journal_article_matcher.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/logic/test_models.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/processing/test_bulk_operation_styles.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/shared.py +0 -0
- {philoch_bib_sdk-0.4.2 → philoch_bib_sdk-0.5.0}/tests/test_tautology.py +0 -0
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: philoch-bib-sdk
|
|
3
|
-
Version: 0.4.2
|
|
4
|
-
Requires-Dist: aletk>=0.1.
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Requires-Dist: aletk>=0.1.8
|
|
5
5
|
Requires-Dist: attrs>=25.3.0
|
|
6
6
|
Requires-Dist: polars>=1.32.3
|
|
7
7
|
Requires-Dist: pydantic>=2.11.9
|
|
8
8
|
Requires-Dist: cytoolz>=1.0.1
|
|
9
|
+
Requires-Dist: httpx>=0.28.0
|
|
9
10
|
License-File: LICENSE
|
|
10
11
|
Summary: Standard development kit for the Philosophie Bibliography project
|
|
11
12
|
Author-email: Luis Alejandro Bordo García <luis.bordo@philosophie.ch>
|
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
# Python Style Guide
|
|
2
|
+
|
|
3
|
+
An opinionated set of Python standards centered on strict typing, immutability, and functional architecture.
|
|
4
|
+
|
|
5
|
+
## Type Safety
|
|
6
|
+
|
|
7
|
+
Enforce strict type safety using mypy with the `--strict` flag.
|
|
8
|
+
|
|
9
|
+
### Requirements
|
|
10
|
+
|
|
11
|
+
- All functions must have complete type annotations for parameters and return values
|
|
12
|
+
- No use of `Any` type unless justified with a comment explaining why and peer-reviewed
|
|
13
|
+
- No use of `cast()` calls
|
|
14
|
+
- No use of `# type: ignore` comments unless justified with a comment explaining why and peer-reviewed
|
|
15
|
+
- All type errors must be resolved properly through type narrowing or proper type design
|
|
16
|
+
|
|
17
|
+
### Type Narrowing
|
|
18
|
+
|
|
19
|
+
When dealing with union types, use explicit type checking:
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
value = record.field
|
|
23
|
+
if isinstance(value, ExpectedType):
|
|
24
|
+
result = value.attribute # Type narrowed, safe to access
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Preserve Type Safety - Never Convert to Dicts
|
|
28
|
+
|
|
29
|
+
**CRITICAL**: Do not convert typed objects to dictionaries to access attributes. This loses all type safety.
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
# NEVER DO THIS - loses type safety
|
|
33
|
+
data = record.model_dump() # or __dict__ or dict(record)
|
|
34
|
+
name = data.get("name", "") # Type checker cannot verify this
|
|
35
|
+
|
|
36
|
+
# ALWAYS DO THIS - preserves type safety
|
|
37
|
+
name_field = record.name
|
|
38
|
+
if isinstance(name_field, NameType):
|
|
39
|
+
name = name_field.value
|
|
40
|
+
else:
|
|
41
|
+
name = ""
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Reasons:
|
|
45
|
+
- Dictionary access bypasses type checking completely
|
|
46
|
+
- Typos in keys are not caught by mypy
|
|
47
|
+
- Attribute renames do not update dictionary keys automatically
|
|
48
|
+
- Type narrowing is lost, leading to runtime errors
|
|
49
|
+
|
|
50
|
+
Always access attributes directly and use isinstance() for type narrowing.
|
|
51
|
+
|
|
52
|
+
### Forward References
|
|
53
|
+
|
|
54
|
+
Use `TYPE_CHECKING` for imports that are only needed for type annotations to avoid circular imports:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from typing import TYPE_CHECKING
|
|
58
|
+
|
|
59
|
+
if TYPE_CHECKING:
|
|
60
|
+
from mypackage.models import SomeModel
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Pydantic at the Boundaries
|
|
64
|
+
|
|
65
|
+
Use Pydantic models exclusively at system boundaries - the primary side (user input, CLI arguments, configuration files) and the secondary side (API responses, database rows, external service payloads). Pydantic's validation overhead is justified here because this is where untrusted data enters the system.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# Primary side: parsing user input
|
|
69
|
+
class CreateUserRequest(BaseModel):
|
|
70
|
+
model_config = ConfigDict(strict=True)
|
|
71
|
+
name: str
|
|
72
|
+
email: EmailStr
|
|
73
|
+
|
|
74
|
+
# Secondary side: parsing an external API response
|
|
75
|
+
class ExternalPayload(BaseModel):
|
|
76
|
+
model_config = ConfigDict(strict=True)
|
|
77
|
+
id: int
|
|
78
|
+
status: str
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Once data crosses a boundary and is validated, convert it into lightweight internal representations (e.g., frozen attrs classes, named tuples, or plain typed values) for all further processing. Do not pass Pydantic models through core business logic:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# At the boundary: validate, then convert
|
|
85
|
+
request = CreateUserRequest.model_validate(raw_input)
|
|
86
|
+
user = User(name=request.name, email=request.email) # attrs/dataclass
|
|
87
|
+
|
|
88
|
+
# Inside the core: work with lightweight, validated data
|
|
89
|
+
def process_user(user: User) -> Result:
|
|
90
|
+
...
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
This approach gives you:
|
|
94
|
+
- **Fail-fast guarantees** - malformed data is rejected immediately at the edges
|
|
95
|
+
- **Runtime consistency** - everything inside the core is already validated
|
|
96
|
+
- **No hidden overhead** - Pydantic validation runs once, not on every function call
|
|
97
|
+
- **Clean separation** - boundary concerns (parsing, serialization) stay out of business logic
|
|
98
|
+
|
|
99
|
+
## Data Structures and Idioms
|
|
100
|
+
|
|
101
|
+
The following patterns are preferred for immutability and clarity:
|
|
102
|
+
|
|
103
|
+
1. **Prefer tuples over lists** for sequences that do not need mutation
|
|
104
|
+
```python
|
|
105
|
+
items = tuple(process(x) for x in source) # Preferred
|
|
106
|
+
items = [process(x) for x in source] # Avoid
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
2. **Prefer `frozenset` over `set`** for immutable unique collections
|
|
110
|
+
|
|
111
|
+
3. **Prefer comprehensions over explicit loops** when the logic is straightforward
|
|
112
|
+
```python
|
|
113
|
+
# Preferred
|
|
114
|
+
results = {key: frozenset(items) for key, items in mapping.items()}
|
|
115
|
+
|
|
116
|
+
# Avoid
|
|
117
|
+
results = {}
|
|
118
|
+
for key, items in mapping.items():
|
|
119
|
+
results[key] = frozenset(items)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
4. **Use set operations** for collection operations
|
|
123
|
+
```python
|
|
124
|
+
# Preferred
|
|
125
|
+
candidates.update(index[key])
|
|
126
|
+
|
|
127
|
+
# Avoid
|
|
128
|
+
for item in index[key]:
|
|
129
|
+
candidates.add(item)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
5. **Use `frozen=True` and `slots=True`** on data classes for immutability and memory efficiency (e.g., `attrs.define(frozen=True, slots=True)` or `@dataclass(frozen=True, slots=True)`)
|
|
133
|
+
|
|
134
|
+
6. **Use `Enum` or `StrEnum` for closed sets of values** — this enables exhaustive `match` checking via mypy's `exhaustive-match` error code
|
|
135
|
+
```python
|
|
136
|
+
from enum import StrEnum
|
|
137
|
+
|
|
138
|
+
class Status(StrEnum):
|
|
139
|
+
ACTIVE = "active"
|
|
140
|
+
INACTIVE = "inactive"
|
|
141
|
+
PENDING = "pending"
|
|
142
|
+
|
|
143
|
+
def handle(status: Status) -> str:
|
|
144
|
+
match status:
|
|
145
|
+
case Status.ACTIVE: return "go"
|
|
146
|
+
case Status.INACTIVE: return "stop"
|
|
147
|
+
case Status.PENDING: return "wait"
|
|
148
|
+
# mypy error if a case is missing
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Performance
|
|
152
|
+
|
|
153
|
+
### Avoid N+1 Problems
|
|
154
|
+
|
|
155
|
+
Never perform I/O inside a loop when a batch operation is available. This is the single most common performance mistake:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
# NEVER DO THIS - N+1: one query per item
|
|
159
|
+
results = tuple(fetch(item_id) for item_id in item_ids)
|
|
160
|
+
|
|
161
|
+
# ALWAYS DO THIS - single batch call
|
|
162
|
+
results = batch_fetch(item_ids)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
The same applies to HTTP calls, file reads, and any other I/O. If you are calling an external service per item, look for a batch endpoint or gather inputs first.
|
|
166
|
+
|
|
167
|
+
### Use Appropriate Data Structures for Lookups
|
|
168
|
+
|
|
169
|
+
Choose data structures based on access patterns:
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
# Membership testing: use a set, not a list
|
|
173
|
+
valid_ids: frozenset[int] = frozenset(load_valid_ids())
|
|
174
|
+
if item_id in valid_ids: # O(1)
|
|
175
|
+
...
|
|
176
|
+
|
|
177
|
+
# Keyed access: use a dict, not linear search
|
|
178
|
+
users_by_id: dict[int, User] = {u.id: u for u in users}
|
|
179
|
+
user = users_by_id[target_id] # O(1)
|
|
180
|
+
|
|
181
|
+
# Avoid: scanning a list for every lookup — O(n) per call
|
|
182
|
+
user = next(u for u in users if u.id == target_id)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Avoid Nested Loops over Large Collections
|
|
186
|
+
|
|
187
|
+
Nested iteration over two large collections is O(n*m). Restructure with index lookups:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
# Avoid - O(n * m)
|
|
191
|
+
matched = tuple(
|
|
192
|
+
(o, p)
|
|
193
|
+
for o in orders
|
|
194
|
+
for p in products
|
|
195
|
+
if o.product_id == p.id
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Preferred - O(n + m): build an index, then join
|
|
199
|
+
products_by_id = {p.id: p for p in products}
|
|
200
|
+
matched = tuple(
|
|
201
|
+
(o, products_by_id[o.product_id])
|
|
202
|
+
for o in orders
|
|
203
|
+
if o.product_id in products_by_id
|
|
204
|
+
)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Prefer Generators for Large Pipelines
|
|
208
|
+
|
|
209
|
+
When processing large datasets, use generator expressions to keep memory usage constant. Each item flows through the entire chain before the next is pulled:
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
# Constant memory - items processed one at a time
|
|
213
|
+
validated = (validate(item) for item in raw_items)
|
|
214
|
+
transformed = (transform(item) for item in validated)
|
|
215
|
+
write_output(transformed)
|
|
216
|
+
|
|
217
|
+
# Avoid - loads entire dataset into memory at each step
|
|
218
|
+
validated = [validate(item) for item in raw_items]
|
|
219
|
+
transformed = [transform(item) for item in validated]
|
|
220
|
+
write_output(transformed)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
See the [Streaming with Chained Generators](#streaming-with-chained-generators) subsection for the full pattern.
|
|
224
|
+
|
|
225
|
+
### Profile Before Optimizing
|
|
226
|
+
|
|
227
|
+
Do not guess at bottlenecks. Measure first with `cProfile` or `line_profiler`, then optimize the hot path:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
python -m cProfile -s cumtime my_script.py
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Functional Architecture (Hexagonal / Ports & Adapters)
|
|
234
|
+
|
|
235
|
+
Follow hexagonal architecture principles using functional programming. The key insight: **hexagonal architecture doesn't require OOP** - function signatures serve as interfaces.
|
|
236
|
+
|
|
237
|
+
### Core Principles
|
|
238
|
+
|
|
239
|
+
1. **Business logic doesn't depend on I/O details**
|
|
240
|
+
2. **Dependencies point inward** - concrete implementations depend on abstract interfaces
|
|
241
|
+
3. **Ports define what you need** - type aliases for function signatures
|
|
242
|
+
4. **Adapters provide how** - concrete implementations matching those signatures
|
|
243
|
+
|
|
244
|
+
### Defining Ports (Abstract Interfaces)
|
|
245
|
+
|
|
246
|
+
Use type aliases to define the "shape" of functions your core logic needs:
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
from typing import Callable, Generator
|
|
250
|
+
|
|
251
|
+
# Port: what the core logic needs (abstract)
|
|
252
|
+
type TItemReader[ReaderIn] = Callable[[ReaderIn], Generator[Item, None, None]]
|
|
253
|
+
type TItemWriter[WriterIn] = Callable[[Generator[Item, None, None], WriterIn], None]
|
|
254
|
+
type TTransform = Callable[[Item], Item]
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
The type signature **is** the contract. Any function matching that signature can be injected.
|
|
258
|
+
|
|
259
|
+
### Implementing Adapters (Concrete Implementations)
|
|
260
|
+
|
|
261
|
+
Create concrete functions that satisfy the port signatures:
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# Adapter: filesystem implementation
|
|
265
|
+
def read_from_filesystem(input_dirname: str) -> Generator[Item, None, None]:
|
|
266
|
+
for file_name in os.listdir(input_dirname):
|
|
267
|
+
yield read_file(os.path.join(input_dirname, file_name))
|
|
268
|
+
|
|
269
|
+
# Adapter: database implementation (alternative)
|
|
270
|
+
def read_from_database(connection_string: str) -> Generator[Item, None, None]:
|
|
271
|
+
# ... database-specific logic
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Abstract Process Functions
|
|
275
|
+
|
|
276
|
+
Write core logic that accepts injected functions:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
def abstract_process[I, O](
|
|
280
|
+
reader: TItemReader[I],
|
|
281
|
+
reader_input: I,
|
|
282
|
+
transform: TTransform,
|
|
283
|
+
writer: TItemWriter[O],
|
|
284
|
+
writer_input: O,
|
|
285
|
+
) -> None:
|
|
286
|
+
"""Core business logic - knows nothing about filesystems, databases, etc."""
|
|
287
|
+
raw_items = reader(reader_input)
|
|
288
|
+
processed = (transform(item) for item in raw_items)
|
|
289
|
+
writer(processed, writer_input)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
### Wiring: Injecting Dependencies
|
|
293
|
+
|
|
294
|
+
Create concrete entry points that wire everything together:
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
def main_filesystem(input_dir: str, output_dir: str) -> None:
|
|
298
|
+
"""Concrete implementation using filesystem adapters."""
|
|
299
|
+
abstract_process(
|
|
300
|
+
reader=read_from_filesystem,
|
|
301
|
+
reader_input=input_dir,
|
|
302
|
+
transform=my_transform_function,
|
|
303
|
+
writer=write_to_filesystem,
|
|
304
|
+
writer_input=output_dir,
|
|
305
|
+
)
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### Benefits Over OOP-Style Dependency Injection
|
|
309
|
+
|
|
310
|
+
| Aspect | FP Style | OOP Style |
|
|
311
|
+
|--------|----------|-----------|
|
|
312
|
+
| Interface definition | Type alias | Abstract class/Protocol |
|
|
313
|
+
| Boilerplate | Minimal | Class definitions, `__init__`, etc. |
|
|
314
|
+
| Testing | Pass mock functions directly | Mock objects, DI frameworks |
|
|
315
|
+
| Composition | Natural function composition | Decorator pattern, etc. |
|
|
316
|
+
| State | Explicit (parameters) | Hidden in `self` |
|
|
317
|
+
|
|
318
|
+
### When to Use This Pattern
|
|
319
|
+
|
|
320
|
+
Use functional hexagonal architecture when:
|
|
321
|
+
|
|
322
|
+
- You are building processing pipelines (read → transform → write)
|
|
323
|
+
- Multiple I/O backends are possible (filesystem, database, API)
|
|
324
|
+
- Business logic should be testable in isolation
|
|
325
|
+
- You want to swap implementations without changing core logic
|
|
326
|
+
|
|
327
|
+
### Example: Complete Module Structure
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
# types.py - Port definitions
|
|
331
|
+
type TValidate = Callable[[str], str]
|
|
332
|
+
type TSanitize = Callable[[str], str]
|
|
333
|
+
type TItemReader[In] = Callable[[In], Generator[Item, None, None]]
|
|
334
|
+
type TItemWriter[Out] = Callable[[Generator[Item, None, None], Out], None]
|
|
335
|
+
|
|
336
|
+
# core.py - Abstract process (pure business logic)
|
|
337
|
+
def process_content(
|
|
338
|
+
content: str,
|
|
339
|
+
validate: TValidate,
|
|
340
|
+
sanitize: TSanitize,
|
|
341
|
+
) -> str:
|
|
342
|
+
return sanitize(validate(content))
|
|
343
|
+
|
|
344
|
+
def abstract_process[I, O](...) -> None:
|
|
345
|
+
# Orchestration logic
|
|
346
|
+
|
|
347
|
+
# adapters/filesystem.py - Filesystem adapter
|
|
348
|
+
def filesystem_reader(dirname: str) -> Generator[Item, None, None]: ...
|
|
349
|
+
def filesystem_writer(items: Generator[Item, None, None], dirname: str) -> None: ...
|
|
350
|
+
|
|
351
|
+
# adapters/transforms.py - Transform implementations
|
|
352
|
+
def regex_validate(content: str) -> str: ...
|
|
353
|
+
def html_sanitize(content: str) -> str: ...
|
|
354
|
+
|
|
355
|
+
# main.py - Wiring
|
|
356
|
+
def main_filesystem(input_dir: str, output_dir: str) -> None:
|
|
357
|
+
abstract_process(
|
|
358
|
+
reader=filesystem_reader,
|
|
359
|
+
reader_input=input_dir,
|
|
360
|
+
validate=regex_validate,
|
|
361
|
+
sanitize=html_sanitize,
|
|
362
|
+
writer=filesystem_writer,
|
|
363
|
+
writer_input=output_dir,
|
|
364
|
+
)
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
### Streaming with Chained Generators
|
|
368
|
+
|
|
369
|
+
This extends the `abstract_process` pattern shown above with multiple chained transformation steps. Each step is lazy - no intermediate lists are allocated - and only the terminal function at the end of the chain triggers evaluation and produces side effects:
|
|
370
|
+
|
|
371
|
+
```python
|
|
372
|
+
from typing import Callable, Generator
|
|
373
|
+
|
|
374
|
+
type TReader[In] = Callable[[In], Generator[Item, None, None]]
|
|
375
|
+
type TTransform = Callable[[Item], Item]
|
|
376
|
+
type TToRow = Callable[[Item], str]
|
|
377
|
+
type TWriter[Out] = Callable[[Generator[str, None, None], Out], None]
|
|
378
|
+
|
|
379
|
+
def process_pipeline[I, O](
|
|
380
|
+
reader: TReader[I],
|
|
381
|
+
reader_input: I,
|
|
382
|
+
validate: TTransform,
|
|
383
|
+
transform: TTransform,
|
|
384
|
+
to_row: TToRow,
|
|
385
|
+
writer: TWriter[O],
|
|
386
|
+
writer_output: O,
|
|
387
|
+
) -> None:
|
|
388
|
+
raw_items = reader(reader_input) # Generator[Item, None, None]
|
|
389
|
+
validated = (validate(item) for item in raw_items) # lazy
|
|
390
|
+
transformed = (transform(item) for item in validated) # lazy
|
|
391
|
+
rows = (to_row(item) for item in transformed) # lazy
|
|
392
|
+
writer(rows, writer_output) # terminal: consumes the chain
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
The entire chain evaluates one item at a time, end to end, before pulling the next. This keeps memory usage constant regardless of input size. The terminal function (here `writer`) is the only place where output side effects occur - apart from the reader's lazy input I/O, everything upstream is a pure transformation.
|
|
396
|
+
|
|
397
|
+
This composes naturally with the hexagonal architecture: `reader` and `writer` are adapters, `validate`, `transform`, and `to_row` are injected core functions, and `process_pipeline` is the abstract core process that a wiring entry point calls with concrete adapters.
|
|
398
|
+
|
|
399
|
+
## Code Organization
|
|
400
|
+
|
|
401
|
+
### Module Structure
|
|
402
|
+
|
|
403
|
+
- **Types/Ports**: Type aliases defining function signatures (interfaces)
|
|
404
|
+
- **Core**: Abstract process functions that accept injected dependencies
|
|
405
|
+
- **Adapters**: Concrete implementations for I/O and transformations
|
|
406
|
+
- **Main/Wiring**: Entry points that wire adapters into core logic
|
|
407
|
+
- **Models**: Define data structures using frozen, slotted classes (e.g., `attrs.define(frozen=True, slots=True)`)
|
|
408
|
+
- **No classes** except for simple data containers and index structures
|
|
409
|
+
|
|
410
|
+
### Function Design
|
|
411
|
+
|
|
412
|
+
Functions should be:
|
|
413
|
+
|
|
414
|
+
- **Pure** when possible (no side effects)
|
|
415
|
+
- **Small and focused** (single responsibility)
|
|
416
|
+
- **Composable** (easy to combine with other functions)
|
|
417
|
+
- **Injectable** (accept dependencies as parameters rather than importing them)
|
|
418
|
+
|
|
419
|
+
### Imports
|
|
420
|
+
|
|
421
|
+
Group imports in the following order:
|
|
422
|
+
|
|
423
|
+
1. Standard library imports
|
|
424
|
+
2. Third-party library imports
|
|
425
|
+
3. Local application imports
|
|
426
|
+
|
|
427
|
+
Within each group, sort alphabetically.
|
|
428
|
+
|
|
429
|
+
### Logging over Print
|
|
430
|
+
|
|
431
|
+
Use the `logging` module for all output beyond throwaway debugging. `print` statements should not appear in committed code.
|
|
432
|
+
|
|
433
|
+
```python
|
|
434
|
+
import logging
|
|
435
|
+
|
|
436
|
+
logger = logging.getLogger(__name__)
|
|
437
|
+
|
|
438
|
+
logger.info("Processing %d items", count)
|
|
439
|
+
logger.error("Failed to connect to %s", url)
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
## Testing
|
|
443
|
+
|
|
444
|
+
### Test Requirements
|
|
445
|
+
|
|
446
|
+
- All new functionality must have corresponding tests
|
|
447
|
+
- Tests must pass with `pytest`
|
|
448
|
+
- Test coverage should be comprehensive
|
|
449
|
+
- Tests should be deterministic and fast
|
|
450
|
+
|
|
451
|
+
### Test Structure
|
|
452
|
+
|
|
453
|
+
```python
|
|
454
|
+
def test_feature_description() -> None:
|
|
455
|
+
"""Brief description of what is being tested."""
|
|
456
|
+
# Arrange
|
|
457
|
+
input_data = create_test_data()
|
|
458
|
+
|
|
459
|
+
# Act
|
|
460
|
+
result = function_under_test(input_data)
|
|
461
|
+
|
|
462
|
+
# Assert
|
|
463
|
+
assert result == expected_value
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
### Parametrized Tests
|
|
467
|
+
|
|
468
|
+
Use `pytest.mark.parametrize` for testing multiple cases:
|
|
469
|
+
|
|
470
|
+
```python
|
|
471
|
+
@pytest.mark.parametrize(
|
|
472
|
+
"input_value, expected_output",
|
|
473
|
+
[
|
|
474
|
+
(case1_input, case1_output),
|
|
475
|
+
(case2_input, case2_output),
|
|
476
|
+
],
|
|
477
|
+
)
|
|
478
|
+
def test_multiple_cases(input_value: str, expected_output: str) -> None:
|
|
479
|
+
assert transform(input_value) == expected_output
|
|
480
|
+
```
|
|
481
|
+
|
|
482
|
+
## Documentation
|
|
483
|
+
|
|
484
|
+
### Docstrings
|
|
485
|
+
|
|
486
|
+
All public functions and classes must have docstrings following this format:
|
|
487
|
+
|
|
488
|
+
```python
|
|
489
|
+
def function_name(param1: Type1, param2: Type2) -> ReturnType:
|
|
490
|
+
"""Brief one-line description.
|
|
491
|
+
|
|
492
|
+
Longer description if needed, explaining the purpose and behavior.
|
|
493
|
+
|
|
494
|
+
Args:
|
|
495
|
+
param1: Description of first parameter
|
|
496
|
+
param2: Description of second parameter
|
|
497
|
+
|
|
498
|
+
Returns:
|
|
499
|
+
Description of return value
|
|
500
|
+
"""
|
|
501
|
+
```
|
|
502
|
+
|
|
503
|
+
### Comments
|
|
504
|
+
|
|
505
|
+
- Use comments sparingly - prefer self-documenting code
|
|
506
|
+
- Explain **why**, not **what** (the code shows what)
|
|
507
|
+
- Update comments when code changes
|
|
508
|
+
|
|
509
|
+
## Formatting
|
|
510
|
+
|
|
511
|
+
### General Style
|
|
512
|
+
|
|
513
|
+
- Follow PEP 8 conventions
|
|
514
|
+
- Line length: 88 characters (Black default)
|
|
515
|
+
- Use double quotes for strings
|
|
516
|
+
- Use trailing commas in multi-line structures
|
|
517
|
+
|
|
518
|
+
### Function Signatures
|
|
519
|
+
|
|
520
|
+
For functions with many parameters, format each parameter on its own line:
|
|
521
|
+
|
|
522
|
+
```python
|
|
523
|
+
def complex_function(
|
|
524
|
+
parameter1: Type1,
|
|
525
|
+
parameter2: Type2,
|
|
526
|
+
parameter3: Type3 = default_value,
|
|
527
|
+
) -> ReturnType:
|
|
528
|
+
pass
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
## Error Handling
|
|
532
|
+
|
|
533
|
+
### Type-Safe Error Handling
|
|
534
|
+
|
|
535
|
+
Handle expected errors explicitly:
|
|
536
|
+
|
|
537
|
+
```python
|
|
538
|
+
# Check conditions and return early
|
|
539
|
+
if not valid_input(data):
|
|
540
|
+
return default_value
|
|
541
|
+
|
|
542
|
+
# Use isinstance for type narrowing
|
|
543
|
+
if isinstance(value, ExpectedType):
|
|
544
|
+
process(value)
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
### Avoid Bare Except
|
|
548
|
+
|
|
549
|
+
Always catch specific exceptions:
|
|
550
|
+
|
|
551
|
+
```python
|
|
552
|
+
# Preferred
|
|
553
|
+
try:
|
|
554
|
+
risky_operation()
|
|
555
|
+
except ValueError as e:
|
|
556
|
+
handle_value_error(e)
|
|
557
|
+
|
|
558
|
+
# Avoid
|
|
559
|
+
try:
|
|
560
|
+
risky_operation()
|
|
561
|
+
except:
|
|
562
|
+
pass
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
## Version Control
|
|
566
|
+
|
|
567
|
+
### Commit Messages
|
|
568
|
+
|
|
569
|
+
Follow conventional commits format:
|
|
570
|
+
|
|
571
|
+
```
|
|
572
|
+
type(scope): brief description
|
|
573
|
+
|
|
574
|
+
Longer explanation if needed.
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
Types: `feat`, `fix`, `refactor`, `test`, `docs`, `chore`
|
|
578
|
+
|
|
579
|
+
## Tools
|
|
580
|
+
|
|
581
|
+
### Required Tools
|
|
582
|
+
|
|
583
|
+
- `mypy` - Type checking with `--strict` mode
|
|
584
|
+
- `pytest` - Testing framework
|
|
585
|
+
- A dependency manager such as `poetry`, `uv`, or `pip`
|
|
586
|
+
|
|
587
|
+
### Running Checks
|
|
588
|
+
|
|
589
|
+
```bash
|
|
590
|
+
# Type checking
|
|
591
|
+
mypy .
|
|
592
|
+
|
|
593
|
+
# Run tests
|
|
594
|
+
pytest
|
|
595
|
+
|
|
596
|
+
# Run specific test file
|
|
597
|
+
pytest tests/path/to/test_file.py -v
|
|
598
|
+
```
|
|
599
|
+
|
|
600
|
+
### Suggested `pyproject.toml` Configuration
|
|
601
|
+
|
|
602
|
+
A strict mypy + Pydantic setup (works with any build backend — Poetry, uv, etc.):
|
|
603
|
+
|
|
604
|
+
```toml
|
|
605
|
+
[tool.mypy]
|
|
606
|
+
python_version = "3.13"
|
|
607
|
+
strict = true
|
|
608
|
+
explicit_package_bases = true
|
|
609
|
+
warn_unreachable = true
|
|
610
|
+
disallow_any_explicit = true
|
|
611
|
+
disallow_any_unimported = true
|
|
612
|
+
disallow_any_decorated = true
|
|
613
|
+
enable_error_code = [
|
|
614
|
+
"possibly-undefined",
|
|
615
|
+
"redundant-expr",
|
|
616
|
+
"truthy-bool",
|
|
617
|
+
"truthy-iterable",
|
|
618
|
+
"exhaustive-match",
|
|
619
|
+
]
|
|
620
|
+
mypy_path = "."
|
|
621
|
+
plugins = ["pydantic.mypy"]
|
|
622
|
+
|
|
623
|
+
[tool.pydantic-mypy]
|
|
624
|
+
init_forbid_extra = true
|
|
625
|
+
init_typed = true
|
|
626
|
+
warn_required_dynamic_aliases = true
|
|
627
|
+
warn_untyped_fields = true
|
|
628
|
+
|
|
629
|
+
# Override for third-party libraries that ship without type stubs.
|
|
630
|
+
# Add libraries here only when no stubs exist (check typeshed / pypi for *-stubs).
|
|
631
|
+
[[tool.mypy.overrides]]
|
|
632
|
+
module = [
|
|
633
|
+
"some_untyped_lib",
|
|
634
|
+
"another_untyped_lib",
|
|
635
|
+
]
|
|
636
|
+
ignore_missing_imports = true
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
## Summary
|
|
640
|
+
|
|
641
|
+
This guide prioritizes:
|
|
642
|
+
|
|
643
|
+
1. **Type safety** - Strict mypy compliance without escape hatches
|
|
644
|
+
2. **Immutability and clarity** - Tuples, frozensets, frozen data classes, comprehensions over loops
|
|
645
|
+
3. **Performance** - Batch I/O, appropriate data structures, generators for constant memory
|
|
646
|
+
4. **Testability** - Comprehensive test coverage with fast, deterministic tests
|
|
647
|
+
|
|
648
|
+
When in doubt, consult existing code in the project for examples of these patterns in practice.
|