eticket-document-sdk 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. eticket_document_sdk-1.0.0/.gitignore +30 -0
  2. eticket_document_sdk-1.0.0/LICENSE +21 -0
  3. eticket_document_sdk-1.0.0/PKG-INFO +399 -0
  4. eticket_document_sdk-1.0.0/README.md +342 -0
  5. eticket_document_sdk-1.0.0/eticket_document_sdk/__init__.py +71 -0
  6. eticket_document_sdk-1.0.0/eticket_document_sdk/core/__init__.py +14 -0
  7. eticket_document_sdk-1.0.0/eticket_document_sdk/core/classifier.py +58 -0
  8. eticket_document_sdk-1.0.0/eticket_document_sdk/core/extractor.py +102 -0
  9. eticket_document_sdk-1.0.0/eticket_document_sdk/core/parser.py +155 -0
  10. eticket_document_sdk-1.0.0/eticket_document_sdk/core/validator.py +64 -0
  11. eticket_document_sdk-1.0.0/eticket_document_sdk/data/__init__.py +5 -0
  12. eticket_document_sdk-1.0.0/eticket_document_sdk/data/airports.py +70 -0
  13. eticket_document_sdk-1.0.0/eticket_document_sdk/exceptions/__init__.py +19 -0
  14. eticket_document_sdk-1.0.0/eticket_document_sdk/exceptions/parser_error.py +40 -0
  15. eticket_document_sdk-1.0.0/eticket_document_sdk/exceptions/validation_error.py +25 -0
  16. eticket_document_sdk-1.0.0/eticket_document_sdk/models/__init__.py +19 -0
  17. eticket_document_sdk-1.0.0/eticket_document_sdk/models/booking.py +59 -0
  18. eticket_document_sdk-1.0.0/eticket_document_sdk/models/flight.py +41 -0
  19. eticket_document_sdk-1.0.0/eticket_document_sdk/models/passenger.py +31 -0
  20. eticket_document_sdk-1.0.0/eticket_document_sdk/models/response.py +46 -0
  21. eticket_document_sdk-1.0.0/eticket_document_sdk/models/ticket.py +47 -0
  22. eticket_document_sdk-1.0.0/eticket_document_sdk/ocr/__init__.py +7 -0
  23. eticket_document_sdk-1.0.0/eticket_document_sdk/ocr/base.py +34 -0
  24. eticket_document_sdk-1.0.0/eticket_document_sdk/ocr/fallback.py +89 -0
  25. eticket_document_sdk-1.0.0/eticket_document_sdk/ocr/paddle_engine.py +104 -0
  26. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/__init__.py +23 -0
  27. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/airline/__init__.py +34 -0
  28. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/airline/generic_airline.py +69 -0
  29. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/airline/registry.py +90 -0
  30. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/airline/vietnam_airlines.py +362 -0
  31. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/base.py +33 -0
  32. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/generic/__init__.py +5 -0
  33. eticket_document_sdk-1.0.0/eticket_document_sdk/parsers/generic/parser.py +89 -0
  34. eticket_document_sdk-1.0.0/eticket_document_sdk/pdf/__init__.py +5 -0
  35. eticket_document_sdk-1.0.0/eticket_document_sdk/pdf/image_converter.py +59 -0
  36. eticket_document_sdk-1.0.0/eticket_document_sdk/pdf/pdf_reader.py +62 -0
  37. eticket_document_sdk-1.0.0/eticket_document_sdk/pdf/text_extractor.py +99 -0
  38. eticket_document_sdk-1.0.0/eticket_document_sdk/py.typed +0 -0
  39. eticket_document_sdk-1.0.0/eticket_document_sdk/schemas/__init__.py +25 -0
  40. eticket_document_sdk-1.0.0/eticket_document_sdk/schemas/pydantic_models.py +28 -0
  41. eticket_document_sdk-1.0.0/eticket_document_sdk/utils/__init__.py +18 -0
  42. eticket_document_sdk-1.0.0/eticket_document_sdk/utils/currency.py +67 -0
  43. eticket_document_sdk-1.0.0/eticket_document_sdk/utils/date_utils.py +40 -0
  44. eticket_document_sdk-1.0.0/eticket_document_sdk/utils/logging.py +49 -0
  45. eticket_document_sdk-1.0.0/eticket_document_sdk/utils/regex.py +141 -0
  46. eticket_document_sdk-1.0.0/examples/basic_usage.py +36 -0
  47. eticket_document_sdk-1.0.0/examples/batch_parse.py +51 -0
  48. eticket_document_sdk-1.0.0/examples/custom_parser.py +80 -0
  49. eticket_document_sdk-1.0.0/examples/ocr_fallback.py +38 -0
  50. eticket_document_sdk-1.0.0/pyproject.toml +87 -0
  51. eticket_document_sdk-1.0.0/tests/__init__.py +0 -0
  52. eticket_document_sdk-1.0.0/tests/conftest.py +38 -0
  53. eticket_document_sdk-1.0.0/tests/fixtures/F65A7Y.pdf +0 -0
  54. eticket_document_sdk-1.0.0/tests/fixtures/vn_F65A7Y.txt +173 -0
  55. eticket_document_sdk-1.0.0/tests/test_generic_parsers.py +50 -0
  56. eticket_document_sdk-1.0.0/tests/test_image_converter.py +29 -0
  57. eticket_document_sdk-1.0.0/tests/test_ocr_engine.py +112 -0
  58. eticket_document_sdk-1.0.0/tests/test_ocr_fallback.py +77 -0
  59. eticket_document_sdk-1.0.0/tests/test_public_api.py +66 -0
  60. eticket_document_sdk-1.0.0/tests/test_regex.py +63 -0
  61. eticket_document_sdk-1.0.0/tests/test_registry_and_plugins.py +71 -0
  62. eticket_document_sdk-1.0.0/tests/test_schemas_and_logging.py +31 -0
  63. eticket_document_sdk-1.0.0/tests/test_text_extraction.py +60 -0
  64. eticket_document_sdk-1.0.0/tests/test_utils.py +79 -0
  65. eticket_document_sdk-1.0.0/tests/test_validation.py +69 -0
  66. eticket_document_sdk-1.0.0/tests/test_vietnam_airlines_parser.py +107 -0
@@ -0,0 +1,30 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ *.egg
6
+ .eggs/
7
+ build/
8
+ dist/
9
+ *.so
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # Testing / coverage
17
+ .pytest_cache/
18
+ .coverage
19
+ .coverage.*
20
+ htmlcov/
21
+ coverage.xml
22
+
23
+ # Tooling caches
24
+ .ruff_cache/
25
+ .mypy_cache/
26
+
27
+ # OS / editor
28
+ .DS_Store
29
+ .idea/
30
+ .vscode/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 eticket-document-sdk contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,399 @@
1
+ Metadata-Version: 2.4
2
+ Name: eticket-document-sdk
3
+ Version: 1.0.0
4
+ Summary: Parse airline e-ticket PDFs into strongly-typed JSON with high accuracy.
5
+ Project-URL: Homepage, https://github.com/your-org/eticket-document-sdk
6
+ Project-URL: Repository, https://github.com/your-org/eticket-document-sdk
7
+ Project-URL: Issues, https://github.com/your-org/eticket-document-sdk/issues
8
+ Author: eticket-document-sdk contributors
9
+ License: MIT License
10
+
11
+ Copyright (c) 2026 eticket-document-sdk contributors
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: airline,e-ticket,itinerary,ocr,parser,pdf,pnr
32
+ Classifier: Development Status :: 5 - Production/Stable
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python :: 3
36
+ Classifier: Programming Language :: Python :: 3.10
37
+ Classifier: Programming Language :: Python :: 3.11
38
+ Classifier: Programming Language :: Python :: 3.12
39
+ Classifier: Programming Language :: Python :: 3.13
40
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
+ Classifier: Topic :: Text Processing :: General
42
+ Requires-Python: >=3.10
43
+ Requires-Dist: pdfplumber>=0.10
44
+ Requires-Dist: pydantic<3.0,>=2.5
45
+ Requires-Dist: pymupdf>=1.23
46
+ Requires-Dist: python-dateutil>=2.8
47
+ Provides-Extra: dev
48
+ Requires-Dist: build>=1.0; extra == 'dev'
49
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
50
+ Requires-Dist: pytest>=7.4; extra == 'dev'
51
+ Requires-Dist: ruff>=0.5; extra == 'dev'
52
+ Provides-Extra: ocr
53
+ Requires-Dist: numpy>=1.24; extra == 'ocr'
54
+ Requires-Dist: opencv-python>=4.8; extra == 'ocr'
55
+ Requires-Dist: paddleocr>=2.7; extra == 'ocr'
56
+ Description-Content-Type: text/markdown
57
+
58
+ # eticket-document-sdk
59
+
60
+ Parse airline e-ticket PDFs and convert them into **strongly-typed JSON** with
61
+ very high accuracy.
62
+
63
+ The SDK primarily targets PDFs that already contain a **text layer** (the common
64
+ case for e-ticket receipts, booking confirmations and itineraries). **OCR is
65
+ used only as a fallback** when text extraction is insufficient.
66
+
67
+ ```python
68
+ from eticket_document_sdk import ETicketParser
69
+
70
+ parser = ETicketParser()
71
+ booking = parser.parse("ticket.pdf")
72
+ print(booking.model_dump())
73
+ ```
74
+
75
+ ```json
76
+ {
77
+ "booking_code": "F65A7Y",
78
+ "ticket_number": "7382421531079",
79
+ "passenger": { "first_name": "YUMIKO", "last_name": "KOHNO" },
80
+ "currency": "JPY",
81
+ "total_price": 467510,
82
+ "segments": [{ "flight_number": "VN311", "origin": "NRT", "destination": "HAN" }]
83
+ }
84
+ ```
85
+
86
+ ---
87
+
88
+ ## Table of Contents
89
+
90
+ - [Installation](#installation)
91
+ - [Quick Start](#quick-start)
92
+ - [Advanced Usage](#advanced-usage)
93
+ - [Data Model](#data-model)
94
+ - [Parser Architecture](#parser-architecture)
95
+ - [Custom Parser Plugin](#custom-parser-plugin)
96
+ - [Error Handling](#error-handling)
97
+ - [Development Guide](#development-guide)
98
+ - [Release Guide](#release-guide)
99
+ - [Roadmap](#roadmap)
100
+
101
+ ---
102
+
103
+ ## Installation
104
+
105
+ ```bash
106
+ pip install eticket-document-sdk
107
+ ```
108
+
109
+ The base install includes PDF text extraction (PyMuPDF + pdfplumber). The OCR
110
+ fallback backend is a heavy optional extra:
111
+
112
+ ```bash
113
+ # Only needed for scanned / image-only PDFs
114
+ pip install "eticket-document-sdk[ocr]"
115
+ ```
116
+
117
+ | Extra | Pulls in | When you need it |
118
+ |-------|----------|------------------|
119
+ | *(base)* | `pydantic`, `pymupdf`, `pdfplumber`, `python-dateutil` | Text-layer PDFs (the common case) |
120
+ | `ocr` | `paddleocr`, `opencv-python`, `numpy` | Scanned/image PDFs requiring OCR |
121
+ | `dev` | `pytest`, `pytest-cov`, `ruff`, `build` | Contributing / running tests |
122
+
123
+ Requires Python 3.10+.
124
+
125
+ ---
126
+
127
+ ## Quick Start
128
+
129
+ ```python
130
+ from eticket_document_sdk import ETicketParser
131
+
132
+ parser = ETicketParser()
133
+
134
+ # 1) From a file path (PDF or image)
135
+ booking = parser.parse("ticket.pdf")
136
+
137
+ # 2) From raw bytes
138
+ booking = parser.parse_bytes(pdf_bytes)
139
+
140
+ # 3) From already-extracted text (no PDF/OCR needed)
141
+ booking = parser.parse_text(raw_text)
142
+
143
+ print(booking.model_dump()) # python types
144
+ print(booking.model_dump(mode="json")) # JSON-serializable (datetimes -> str)
145
+
146
+ # Convenience top-level accessors:
147
+ booking.booking_code # "F65A7Y"
148
+ booking.ticket_number # "7382421531079"
149
+ booking.currency # "JPY"
150
+ booking.total_price # 467510
151
+ booking.passenger.full_name
152
+ booking.segments[0].origin
153
+ ```
154
+
155
+ ---
156
+
157
+ ## Advanced Usage
158
+
159
+ ### Configuration
160
+
161
+ ```python
162
+ parser = ETicketParser(
163
+ enable_ocr=True, # allow OCR fallback (default True)
164
+ ocr_langs=["vi", "en", "ja"], # OCR languages (Vietnamese/English/Japanese)
165
+ debug=False, # enable DEBUG logging for the SDK
166
+ strict_validation=False, # raise ValidationError on invalid output
167
+ text_quality_threshold=0.35, # below this, OCR fallback kicks in
168
+ )
169
+ ```
170
+
171
+ ### Detailed result envelope
172
+
173
+ `parse()` returns a `Booking`. For metadata about *how* the document was parsed,
174
+ use the `*_detailed` variants, which return a `ParseResult`:
175
+
176
+ ```python
177
+ result = parser.parse_detailed("ticket.pdf")
178
+
179
+ result.success # True
180
+ result.parser # "vietnam_airlines"
181
+ result.extraction_method # ExtractionMethod.TEXT_LAYER | OCR | RAW_TEXT
182
+ result.confidence # 0.0 .. 1.0 completeness heuristic
183
+ result.warnings # list of non-fatal validation issues
184
+ result.booking # the Booking object
185
+ ```
186
+
187
+ Also available: `parse_bytes_detailed(...)` and `parse_text_detailed(...)`.
188
+
189
+ ### Thread-safety & performance
190
+
191
+ A single `ETicketParser` instance is **reusable and thread-safe**. The
192
+ PaddleOCR model, compiled regexes and the parser registry are created once and
193
+ shared, so models are never reloaded between calls. Create one instance and
194
+ reuse it (e.g. for batch processing — see `examples/batch_parse.py`).
195
+
196
+ ---
197
+
198
+ ## Data Model
199
+
200
+ All models are **Pydantic v2**, defined in `eticket_document_sdk.models` and
201
+ also re-exported from `eticket_document_sdk.schemas.pydantic_models`.
202
+
203
+ ```
204
+ Booking
205
+ ├── status: TICKETED | CONFIRMED | PENDING | CANCELLED | UNKNOWN
206
+ ├── passenger: Passenger
207
+ │ ├── title, first_name, last_name, full_name
208
+ │ ├── membership_number # frequent-flyer number
209
+ │ └── passenger_type # ADT | CHD | INF
210
+ ├── segments: list[FlightSegment]
211
+ │ ├── segment_id, flight_number, airline
212
+ │ ├── origin, destination # IATA codes
213
+ │ ├── departure_datetime, arrival_datetime
214
+ │ ├── departure_terminal, arrival_terminal
215
+ │ ├── cabin_class, fare_basis, seat
216
+ │ └── free_baggage, status
217
+ ├── ticket: Ticket
218
+ │ ├── booking_code, ticket_number, currency
219
+ │ ├── base_fare, taxes, total_price
220
+ │ ├── tax_breakdown: list[TaxItem] # {code, amount}
221
+ │ ├── fare_basis, fare_calculation, endorsement
222
+ │ └── issue_date, payment
223
+ └── booking_code / ticket_number / currency / total_price # convenience mirrors
224
+ ```
225
+
226
+ ---
227
+
228
+ ## Parser Architecture
229
+
230
+ The SDK follows a clean, layered pipeline:
231
+
232
+ ```
233
+ ┌─────────────────────────────────────────────────────────┐
234
+ source ──▶ │ 1. detect type (PDF / IMAGE / TEXT) core.classifier│
235
+ │ 2. extract text layer (PyMuPDF → pdfplumber) pdf.* │
236
+ │ 3. measure quality pdf.text_extr. │
237
+ │ 4. OCR fallback if needed (PaddleOCR) ocr.* │
238
+ │ 5. classify airline → pick parser core.classifier│
239
+ │ 6. parse fields parsers.* │
240
+ │ 7. validate (business rules) core.validator │
241
+ │ 8. return Pydantic Booking models.* │
242
+ └─────────────────────────────────────────────────────────┘
243
+ ```
244
+
245
+ Key design points:
246
+
247
+ - **Text-first.** OCR is only attempted when the extracted text layer falls
248
+ below the quality threshold. PyMuPDF is the primary extractor; pdfplumber is
249
+ the fallback engine.
250
+ - **Lazy heavy deps.** PyMuPDF, pdfplumber, PaddleOCR, OpenCV and numpy are all
251
+ imported lazily, so importing the SDK is cheap and the OCR stack is optional.
252
+ - **Centralized regex** (`utils/regex.py`) compiled once at import time.
253
+ - **Pluggable parsers** via a thread-safe registry (see below).
254
+
255
+ ---
256
+
257
+ ## Custom Parser Plugin
258
+
259
+ Add a new airline **without modifying the SDK core**. Implement `BaseParser`
260
+ and register it:
261
+
262
+ ```python
263
+ from eticket_document_sdk import (
264
+ BaseParser, Booking, Ticket, FlightSegment, Passenger, register_parser,
265
+ )
266
+ from eticket_document_sdk.parsers.generic.parser import GenericExtractors
267
+
268
+
269
+ class JapanAirlinesParser(BaseParser):
270
+ code = "JL"
271
+ name = "japan_airlines"
272
+
273
+ def can_parse(self, text: str) -> float:
274
+ # Return 0..1 confidence that this layout is yours.
275
+ return 0.9 if "JAPAN AIRLINES" in text.upper() else 0.0
276
+
277
+ def parse(self, text: str) -> Booking:
278
+ return Booking(
279
+ ticket=Ticket(
280
+ booking_code=GenericExtractors.find_pnr(text),
281
+ ticket_number=GenericExtractors.find_ticket_number(text),
282
+ currency=GenericExtractors.find_currency(text),
283
+ ),
284
+ passenger=Passenger(),
285
+ segments=[],
286
+ )
287
+
288
+
289
+ register_parser("JL", JapanAirlinesParser())
290
+
291
+ # From now on ETicketParser auto-selects it for matching tickets.
292
+ ```
293
+
294
+ The classifier calls `can_parse()` on every registered parser and selects the
295
+ highest scorer; airline-specific parsers always win ties against the generic
296
+ fallback. See `examples/custom_parser.py` for a runnable version.
297
+
298
+ This is the same mechanism by which future airlines (ANA, Air France, Emirates,
299
+ Singapore Airlines, …) can be added.
300
+
301
+ ---
302
+
303
+ ## Error Handling
304
+
305
+ All exceptions derive from `ETicketSDKError`:
306
+
307
+ | Exception | Raised when |
308
+ |-----------|-------------|
309
+ | `DocumentReadError` | File can't be opened/read, or is empty/corrupt |
310
+ | `UnsupportedDocumentError` | Input type/layout not supported |
311
+ | `OCRFailedError` | OCR needed but disabled/unavailable, or produced nothing |
312
+ | `ParserError` | No parser matched, or a parser failed |
313
+ | `ValidationError` | Output failed business-rule validation (strict mode) |
314
+
315
+ ```python
316
+ from eticket_document_sdk import ETicketParser, ETicketSDKError
317
+
318
+ try:
319
+ booking = ETicketParser().parse("ticket.pdf")
320
+ except ETicketSDKError as exc:
321
+ print(f"Failed: {exc}")
322
+ ```
323
+
324
+ Validation (`core/validator.py`) checks ticket-number format (13 digits), PNR
325
+ format (6 alphanumerics), currency code, flight-number format and segment
326
+ presence. In non-strict mode, issues are returned as `result.warnings`; with
327
+ `strict_validation=True`, a `ValidationError` is raised instead.
328
+
329
+ ---
330
+
331
+ ## Development Guide
332
+
333
+ ```bash
334
+ git clone https://github.com/your-org/eticket-document-sdk
335
+ cd eticket-document-sdk
336
+
337
+ python -m venv .venv && source .venv/bin/activate
338
+ pip install -e ".[dev]"
339
+
340
+ # Run the test suite with coverage (target: 90%+)
341
+ pytest --cov=eticket_document_sdk --cov-report=term-missing
342
+
343
+ # Lint / format
344
+ ruff check .
345
+ ruff format .
346
+ ```
347
+
348
+ Tests use a bundled real Vietnam Airlines fixture (`tests/fixtures/`). OCR-path
349
+ tests use a fake engine, so **PaddleOCR is not required to run the suite**.
350
+
351
+ Project layout mirrors the pipeline: `core/` (orchestration), `pdf/`
352
+ (extraction), `ocr/` (fallback), `parsers/` (plugins), `models/` + `schemas/`
353
+ (Pydantic), `utils/`, `exceptions/`.
354
+
355
+ ---
356
+
357
+ ## Release Guide
358
+
359
+ 1. Update the version in `pyproject.toml` and `eticket_document_sdk/__init__.py`
360
+ (`__version__`).
361
+ 2. Update the changelog / release notes.
362
+ 3. Run the full test suite and linters; ensure coverage ≥ 90%.
363
+ 4. Build the distributions:
364
+ ```bash
365
+ python -m build
366
+ ```
367
+ This produces `dist/eticket_document_sdk-<version>-py3-none-any.whl` and the
368
+ sdist `.tar.gz`.
369
+ 5. Smoke-test the wheel in a clean virtualenv:
370
+ ```bash
371
+ pip install dist/eticket_document_sdk-*.whl
372
+ ```
373
+ 6. Publish (install `twine` first — it is not included in the `dev` extra):
374
+ ```bash
375
+ python -m twine upload dist/*
376
+ ```
377
+ 7. Tag the release: `git tag v<version> && git push --tags`.
378
+
379
+ ---
380
+
381
+ ## Roadmap
382
+
383
+ The extractor/parser boundary is deliberately abstract so additional backends
384
+ can be added **without breaking the public API**:
385
+
386
+ - **Cloud OCR / Document AI**: Azure Document Intelligence, AWS Textract,
387
+ Google Document AI — pluggable as alternative `OCREngine` implementations or
388
+ text providers.
389
+ - **LLM structured outputs**: OpenAI and Claude structured outputs as
390
+ alternative parser plugins behind the same `BaseParser` contract.
391
+
392
+ Because selection happens via the registry + classifier, these can be layered in
393
+ as new strategies while `ETicketParser.parse(...)` stays unchanged.
394
+
395
+ ---
396
+
397
+ ## License
398
+
399
+ MIT — see [LICENSE](LICENSE).