eticket-document-sdk 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. eticket_document_sdk/__init__.py +71 -0
  2. eticket_document_sdk/core/__init__.py +14 -0
  3. eticket_document_sdk/core/classifier.py +58 -0
  4. eticket_document_sdk/core/extractor.py +102 -0
  5. eticket_document_sdk/core/parser.py +155 -0
  6. eticket_document_sdk/core/validator.py +64 -0
  7. eticket_document_sdk/data/__init__.py +5 -0
  8. eticket_document_sdk/data/airports.py +70 -0
  9. eticket_document_sdk/exceptions/__init__.py +19 -0
  10. eticket_document_sdk/exceptions/parser_error.py +40 -0
  11. eticket_document_sdk/exceptions/validation_error.py +25 -0
  12. eticket_document_sdk/models/__init__.py +19 -0
  13. eticket_document_sdk/models/booking.py +59 -0
  14. eticket_document_sdk/models/flight.py +41 -0
  15. eticket_document_sdk/models/passenger.py +31 -0
  16. eticket_document_sdk/models/response.py +46 -0
  17. eticket_document_sdk/models/ticket.py +47 -0
  18. eticket_document_sdk/ocr/__init__.py +7 -0
  19. eticket_document_sdk/ocr/base.py +34 -0
  20. eticket_document_sdk/ocr/fallback.py +89 -0
  21. eticket_document_sdk/ocr/paddle_engine.py +104 -0
  22. eticket_document_sdk/parsers/__init__.py +23 -0
  23. eticket_document_sdk/parsers/airline/__init__.py +34 -0
  24. eticket_document_sdk/parsers/airline/generic_airline.py +69 -0
  25. eticket_document_sdk/parsers/airline/registry.py +90 -0
  26. eticket_document_sdk/parsers/airline/vietnam_airlines.py +362 -0
  27. eticket_document_sdk/parsers/base.py +33 -0
  28. eticket_document_sdk/parsers/generic/__init__.py +5 -0
  29. eticket_document_sdk/parsers/generic/parser.py +89 -0
  30. eticket_document_sdk/pdf/__init__.py +5 -0
  31. eticket_document_sdk/pdf/image_converter.py +59 -0
  32. eticket_document_sdk/pdf/pdf_reader.py +62 -0
  33. eticket_document_sdk/pdf/text_extractor.py +99 -0
  34. eticket_document_sdk/py.typed +0 -0
  35. eticket_document_sdk/schemas/__init__.py +25 -0
  36. eticket_document_sdk/schemas/pydantic_models.py +28 -0
  37. eticket_document_sdk/utils/__init__.py +18 -0
  38. eticket_document_sdk/utils/currency.py +67 -0
  39. eticket_document_sdk/utils/date_utils.py +40 -0
  40. eticket_document_sdk/utils/logging.py +49 -0
  41. eticket_document_sdk/utils/regex.py +141 -0
  42. eticket_document_sdk-1.0.0.dist-info/METADATA +399 -0
  43. eticket_document_sdk-1.0.0.dist-info/RECORD +45 -0
  44. eticket_document_sdk-1.0.0.dist-info/WHEEL +4 -0
  45. eticket_document_sdk-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,71 @@
1
+ """eticket-document-sdk.
2
+
3
+ Parse airline e-ticket PDFs into strongly-typed JSON.
4
+
5
+ Quick start::
6
+
7
+ from eticket_document_sdk import ETicketParser
8
+
9
+ parser = ETicketParser()
10
+ booking = parser.parse("ticket.pdf")
11
+ print(booking.model_dump())
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from .core.parser import ETicketParser
17
+ from .exceptions import (
18
+ DocumentReadError,
19
+ ETicketSDKError,
20
+ OCRFailedError,
21
+ ParserError,
22
+ UnsupportedDocumentError,
23
+ ValidationError,
24
+ )
25
+ from .models import (
26
+ Booking,
27
+ BookingStatus,
28
+ DocumentType,
29
+ ExtractionMethod,
30
+ FlightSegment,
31
+ ParseResult,
32
+ Passenger,
33
+ TaxItem,
34
+ Ticket,
35
+ )
36
+ from .parsers.airline.registry import (
37
+ ParserRegistry,
38
+ get_default_registry,
39
+ register_parser,
40
+ )
41
+ from .parsers.base import BaseParser
42
+
43
+ __version__ = "1.0.0"
44
+
45
+ __all__ = [
46
+ "__version__",
47
+ # Public entry point
48
+ "ETicketParser",
49
+ # Plugin system
50
+ "BaseParser",
51
+ "ParserRegistry",
52
+ "register_parser",
53
+ "get_default_registry",
54
+ # Models
55
+ "Booking",
56
+ "BookingStatus",
57
+ "FlightSegment",
58
+ "Passenger",
59
+ "Ticket",
60
+ "TaxItem",
61
+ "ParseResult",
62
+ "DocumentType",
63
+ "ExtractionMethod",
64
+ # Exceptions
65
+ "ETicketSDKError",
66
+ "DocumentReadError",
67
+ "UnsupportedDocumentError",
68
+ "OCRFailedError",
69
+ "ParserError",
70
+ "ValidationError",
71
+ ]
@@ -0,0 +1,14 @@
1
+ """Core pipeline: parser orchestration, extraction, classification, validation."""
2
+
3
+ from .classifier import DocumentClassifier
4
+ from .extractor import DocumentExtractor, ExtractionResult
5
+ from .parser import ETicketParser
6
+ from .validator import DocumentValidator
7
+
8
+ __all__ = [
9
+ "ETicketParser",
10
+ "DocumentExtractor",
11
+ "ExtractionResult",
12
+ "DocumentClassifier",
13
+ "DocumentValidator",
14
+ ]
@@ -0,0 +1,58 @@
1
+ """Document classification: choose the right parser for extracted text."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..models import DocumentType
6
+ from ..parsers.airline.registry import ParserRegistry, get_default_registry
7
+ from ..parsers.base import BaseParser
8
+ from ..pdf import pdf_reader
9
+ from ..utils.logging import get_logger
10
+
11
+ _logger = get_logger("core.classifier")
12
+
13
# Common image extensions for type detection by filename.
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

# Magic-byte prefixes for common image formats. TIFF is listed here (and WebP
# is handled by ``_is_webp``) so that byte sniffing recognises every format
# that ``_IMAGE_EXTS`` already accepts by filename.
_IMAGE_MAGIC = (
    b"\x89PNG\r\n",  # PNG
    b"\xff\xd8\xff",  # JPEG
    b"GIF8",  # GIF
    b"BM",  # BMP
    b"II*\x00",  # TIFF, little-endian
    b"MM\x00*",  # TIFF, big-endian
)


def _is_webp(data: bytes) -> bool:
    """Return True when ``data`` starts with a RIFF header whose form is WEBP.

    WebP cannot be matched by a plain prefix: bytes 4-7 hold the chunk size,
    so the ``RIFF`` tag and the ``WEBP`` form type are checked separately.
    """

    return data[:4] == b"RIFF" and data[8:12] == b"WEBP"


class DocumentClassifier:
    """Detects input type and selects the best parser for the content."""

    def __init__(self, registry: ParserRegistry | None = None) -> None:
        """Use ``registry`` for parser lookup, or the default registry if falsy."""

        self._registry = registry or get_default_registry()

    # -- input type detection ------------------------------------------- #
    @staticmethod
    def detect_type_from_bytes(data: bytes) -> DocumentType:
        """Classify raw bytes as PDF, IMAGE or UNKNOWN from their leading bytes.

        The PDF check is delegated to ``pdf_reader.is_pdf_bytes``; images are
        recognised by the signatures in ``_IMAGE_MAGIC`` plus the WebP check.
        """

        if pdf_reader.is_pdf_bytes(data):
            return DocumentType.PDF
        if data.startswith(_IMAGE_MAGIC) or _is_webp(data):
            return DocumentType.IMAGE
        return DocumentType.UNKNOWN

    @staticmethod
    def detect_type_from_path(path: str) -> DocumentType:
        """Classify a file by its (case-insensitive) filename extension.

        Returns PDF, IMAGE, TEXT or UNKNOWN; the file itself is never opened.
        """

        lower = path.lower()
        if lower.endswith(".pdf"):
            return DocumentType.PDF
        if lower.endswith(tuple(_IMAGE_EXTS)):
            return DocumentType.IMAGE
        if lower.endswith((".txt", ".text")):
            return DocumentType.TEXT
        return DocumentType.UNKNOWN

    # -- parser selection ----------------------------------------------- #
    def classify(self, text: str) -> tuple[BaseParser | None, float]:
        """Return the best-matching parser for ``text`` and its confidence.

        The result is whatever the registry's ``best_match`` reports; the
        parser element may be ``None`` when nothing matches.
        """

        parser, score = self._registry.best_match(text)
        if parser:
            _logger.debug("Classified as %s (score=%.3f)", parser.name, score)
        return parser, score
@@ -0,0 +1,102 @@
1
+ """Turn a raw input source into text, choosing text-layer vs OCR.
2
+
3
+ This is the implementation of pipeline steps 1-4: detect type, extract the text
4
+ layer, measure quality, and fall back to OCR only when necessary.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from ..exceptions import DocumentReadError, UnsupportedDocumentError
12
+ from ..models import DocumentType, ExtractionMethod
13
+ from ..ocr.fallback import OCRFallback
14
+ from ..pdf import text_extractor
15
+ from ..utils.logging import get_logger
16
+ from .classifier import DocumentClassifier
17
+
18
+ _logger = get_logger("core.extractor")
19
+
20
+
21
+ @dataclass
22
+ class ExtractionResult:
23
+ """Outcome of the text-acquisition stage."""
24
+
25
+ text: str
26
+ document_type: DocumentType
27
+ method: ExtractionMethod
28
+ quality: float
29
+
30
+
31
+ class DocumentExtractor:
32
+ """Acquires text from PDF / image / raw-text inputs."""
33
+
34
+ def __init__(
35
+ self,
36
+ *,
37
+ ocr: OCRFallback,
38
+ classifier: DocumentClassifier | None = None,
39
+ text_quality_threshold: float = 0.35,
40
+ min_text_length: int = 80,
41
+ ) -> None:
42
+ self._ocr = ocr
43
+ self._classifier = classifier or DocumentClassifier()
44
+ self._threshold = text_quality_threshold
45
+ self._min_length = min_text_length
46
+
47
+ def extract_from_bytes(
48
+ self, data: bytes, *, document_type: DocumentType | None = None
49
+ ) -> ExtractionResult:
50
+ doc_type = document_type or self._classifier.detect_type_from_bytes(data)
51
+
52
+ if doc_type == DocumentType.PDF:
53
+ return self._extract_pdf(data)
54
+ if doc_type == DocumentType.IMAGE:
55
+ return self._extract_image(data)
56
+ raise UnsupportedDocumentError(
57
+ f"Unsupported document type: {doc_type.value}. "
58
+ "Provide a PDF, an image, or use parse_text() for raw text."
59
+ )
60
+
61
+ def extract_from_text(self, text: str) -> ExtractionResult:
62
+ if not text or not text.strip():
63
+ raise DocumentReadError("Empty text supplied to parse_text().")
64
+ return ExtractionResult(
65
+ text=text,
66
+ document_type=DocumentType.TEXT,
67
+ method=ExtractionMethod.RAW_TEXT,
68
+ quality=text_extractor.measure_quality(text, min_length=self._min_length),
69
+ )
70
+
71
+ # -- internals ------------------------------------------------------ #
72
+ def _extract_pdf(self, data: bytes) -> ExtractionResult:
73
+ text = text_extractor.extract_text(data)
74
+ quality = text_extractor.measure_quality(text, min_length=self._min_length)
75
+ _logger.debug("PDF text-layer quality=%.3f length=%d", quality, len(text))
76
+
77
+ if quality >= self._threshold and len(text) >= self._min_length:
78
+ return ExtractionResult(
79
+ text=text,
80
+ document_type=DocumentType.PDF,
81
+ method=ExtractionMethod.TEXT_LAYER,
82
+ quality=quality,
83
+ )
84
+
85
+ _logger.info("Text layer insufficient (q=%.3f); attempting OCR fallback", quality)
86
+ ocr_text = self._ocr.run_on_pdf(data)
87
+ return ExtractionResult(
88
+ text=ocr_text,
89
+ document_type=DocumentType.PDF,
90
+ method=ExtractionMethod.OCR,
91
+ quality=text_extractor.measure_quality(ocr_text, min_length=self._min_length),
92
+ )
93
+
94
+ def _extract_image(self, data: bytes) -> ExtractionResult:
95
+ # Images have no text layer; go straight to OCR.
96
+ ocr_text = self._ocr.run_on_image(data)
97
+ return ExtractionResult(
98
+ text=ocr_text,
99
+ document_type=DocumentType.IMAGE,
100
+ method=ExtractionMethod.OCR,
101
+ quality=text_extractor.measure_quality(ocr_text, min_length=self._min_length),
102
+ )
@@ -0,0 +1,155 @@
1
+ """Public SDK entry point: :class:`ETicketParser`.
2
+
3
+ Orchestrates the full pipeline (detect -> extract -> classify -> parse ->
4
+ validate -> return) behind a deliberately tiny API:
5
+
6
+ parser = ETicketParser()
7
+ booking = parser.parse("ticket.pdf")
8
+ print(booking.model_dump())
9
+
10
+ The parser instance is reusable and thread-safe: the OCR model, compiled regex
11
+ and parser registry are all created once and shared.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ from pathlib import Path
18
+
19
+ from ..exceptions import ParserError
20
+ from ..models import Booking, ParseResult
21
+ from ..ocr.fallback import OCRFallback
22
+ from ..parsers.airline.registry import ParserRegistry, get_default_registry
23
+ from ..utils.logging import configure_logging, get_logger
24
+ from .classifier import DocumentClassifier
25
+ from .extractor import DocumentExtractor, ExtractionResult
26
+ from .validator import DocumentValidator
27
+
28
+ _logger = get_logger("core.parser")
29
+
30
+ # Fields used to compute a heuristic confidence/completeness score.
31
+ _KEY_FIELDS = ("booking_code", "ticket_number", "currency", "total_price")
32
+
33
+
34
class ETicketParser:
    """Facade that turns e-ticket documents into typed :class:`Booking` objects.

    The three ``parse*`` methods return only the booking; their ``*_detailed``
    counterparts return the full :class:`ParseResult` envelope.
    """

    def __init__(
        self,
        *,
        enable_ocr: bool = True,
        ocr_langs: list[str] | None = None,
        debug: bool = False,
        strict_validation: bool = False,
        text_quality_threshold: float = 0.35,
        registry: ParserRegistry | None = None,
    ) -> None:
        """Wire up the OCR fallback, classifier, extractor and validator.

        ``debug=True`` switches SDK logging to DEBUG. A falsy ``registry``
        selects the default registry, and a falsy ``ocr_langs`` selects
        ``["vi", "en", "ja"]``.
        """

        if debug:
            configure_logging(logging.DEBUG)

        self.debug = debug
        self.strict_validation = strict_validation
        self._registry = registry or get_default_registry()

        languages = ocr_langs or ["vi", "en", "ja"]
        self._ocr = OCRFallback(enabled=enable_ocr, languages=languages)
        self._classifier = DocumentClassifier(self._registry)
        self._extractor = DocumentExtractor(
            ocr=self._ocr,
            classifier=self._classifier,
            text_quality_threshold=text_quality_threshold,
        )
        self._validator = DocumentValidator()

    # ------------------------------------------------------------------ #
    # Public API (returns Booking)
    # ------------------------------------------------------------------ #
    def parse(self, source: str | Path) -> Booking:
        """Parse the file at ``source`` and return its :class:`Booking`."""

        result = self.parse_detailed(source)
        return result.booking  # type: ignore[return-value]

    def parse_bytes(self, data: bytes) -> Booking:
        """Parse in-memory PDF or image bytes and return the :class:`Booking`."""

        result = self.parse_bytes_detailed(data)
        return result.booking  # type: ignore[return-value]

    def parse_text(self, text: str) -> Booking:
        """Parse text that was extracted elsewhere (no PDF/OCR step)."""

        result = self.parse_text_detailed(text)
        return result.booking  # type: ignore[return-value]

    # ------------------------------------------------------------------ #
    # Public API (returns ParseResult envelope)
    # ------------------------------------------------------------------ #
    def parse_detailed(self, source: str | Path) -> ParseResult:
        """Read ``source`` from disk and return the full :class:`ParseResult`.

        The filename extension picks the route: text files are decoded as
        UTF-8 (undecodable bytes replaced) and parsed as text; an unknown
        extension defers to byte sniffing in the extractor.
        """

        from ..pdf import pdf_reader

        path = str(source)
        data = pdf_reader.read_bytes(path)
        doc_type = self._classifier.detect_type_from_path(path)

        if doc_type.name == "TEXT":
            decoded = data.decode("utf-8", errors="replace")
            return self.parse_text_detailed(decoded)

        # An UNKNOWN extension gives no hint; let the extractor sniff bytes.
        hint = None if doc_type.name == "UNKNOWN" else doc_type
        extraction = self._extractor.extract_from_bytes(data, document_type=hint)
        return self._finish(extraction)

    def parse_bytes_detailed(self, data: bytes) -> ParseResult:
        """Extract text from raw bytes and run the rest of the pipeline."""

        return self._finish(self._extractor.extract_from_bytes(data))

    def parse_text_detailed(self, text: str) -> ParseResult:
        """Run the pipeline tail on caller-supplied text."""

        return self._finish(self._extractor.extract_from_text(text))

    # ------------------------------------------------------------------ #
    # Pipeline tail: classify -> parse -> validate -> envelope
    # ------------------------------------------------------------------ #
    def _finish(self, extraction: ExtractionResult) -> ParseResult:
        """Classify, parse and validate extracted text, then build the envelope.

        Raises :class:`ParserError` when no parser matches. Any non-ParserError
        exception from the selected parser is re-raised as a chained
        :class:`ParserError`.
        """

        text = extraction.text
        parser, score = self._classifier.classify(text)
        if parser is None:
            raise ParserError(
                "No parser could handle this document. "
                "Register an airline parser or check the input."
            )

        _logger.info(
            "Parsing with %s (match=%.3f, method=%s)",
            parser.name,
            score,
            extraction.method.value,
        )

        try:
            booking = parser.parse(text)
        except ParserError:
            # Already the SDK's own error type; let it through untouched.
            raise
        except Exception as exc:  # surface parser bugs as ParserError
            failure = ParserError(f"Parser {parser.name!r} failed.", details=str(exc))
            raise failure from exc

        warnings = self._validator.validate(booking, strict=self.strict_validation)
        confidence = _completeness(booking)

        return ParseResult(
            success=True,
            document_type=extraction.document_type,
            extraction_method=extraction.method,
            parser=parser.name,
            confidence=confidence,
            booking=booking,
            warnings=warnings,
        )
147
+
148
+
149
def _completeness(booking: Booking) -> float:
    """Return a heuristic completeness score for ``booking``.

    The score is ``0.7 * (fraction of _KEY_FIELDS present) + 0.3 * (1 if the
    booking has any segments else 0)``, rounded to four decimals.

    A key field counts as present when it is not ``None`` either directly on
    the booking or on ``booking.ticket``. The ticket fallback matters because
    the validator reads these same fields from ``booking.ticket``; looking only
    at the booking itself would score such fields as missing.
    """

    ticket = getattr(booking, "ticket", None)

    def _is_present(field: str) -> bool:
        if getattr(booking, field, None) is not None:
            return True
        return ticket is not None and getattr(ticket, field, None) is not None

    present = sum(1 for f in _KEY_FIELDS if _is_present(f))
    field_score = present / len(_KEY_FIELDS)
    segment_score = 1.0 if booking.segments else 0.0
    return round(0.7 * field_score + 0.3 * segment_score, 4)
@@ -0,0 +1,64 @@
1
+ """Business-rule validation for parsed bookings.
2
+
3
+ Pydantic validates field *types*; this layer validates cross-field *business*
4
+ rules (ticket number length, currency sanity, presence of segments) and either
5
+ raises :class:`ValidationError` (strict mode) or returns a list of warnings.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from ..exceptions import ValidationError
13
+ from ..models import Booking
14
+ from ..utils.currency import is_valid_currency
15
+ from ..utils.logging import get_logger
16
+
17
+ _logger = get_logger("core.validator")
18
+
19
# All three patterns are applied with ``fullmatch`` so that a trailing newline
# (which ``$`` would tolerate) is not accepted as part of a valid value.
_TICKET_RE = re.compile(r"\d{13}")
_PNR_RE = re.compile(r"[A-Z0-9]{6}")
# Airline designator + 1-4 digit flight number. Two-character designators may
# be letter-letter, letter-digit or digit-letter (e.g. "3K", "9W").
_FLIGHT_RE = re.compile(r"(?:[A-Z]{2}|[A-Z]\d|\d[A-Z])\d{1,4}")


class DocumentValidator:
    """Validates a :class:`Booking` against airline-ticket business rules."""

    def validate(self, booking: Booking, *, strict: bool = False) -> list[str]:
        """Return a list of validation issues.

        Checks the ticket's booking code, ticket number, currency and total
        price, then the presence and flight-number format of the segments.

        In ``strict`` mode, raises :class:`ValidationError` (carrying every
        issue in ``errors``) if any issue is found; otherwise each issue is
        logged as a warning and the list is returned.
        """

        issues: list[str] = []
        t = booking.ticket

        if not t.booking_code:
            issues.append("Missing booking code (PNR).")
        elif not _PNR_RE.fullmatch(t.booking_code):
            issues.append(f"Invalid booking code format: {t.booking_code!r}.")

        if not t.ticket_number:
            issues.append("Missing ticket number.")
        elif not _TICKET_RE.fullmatch(t.ticket_number):
            issues.append(f"Invalid ticket number format: {t.ticket_number!r}.")

        if t.currency and not is_valid_currency(t.currency):
            issues.append(f"Invalid currency code: {t.currency!r}.")

        if t.total_price is not None and t.total_price < 0:
            issues.append("Total price is negative.")

        if not booking.segments:
            issues.append("No flight segments found.")
        # ``or ()`` keeps a missing (None) segment list from crashing the loop
        # after it has already been reported above.
        for seg in booking.segments or ():
            if seg.flight_number and not _FLIGHT_RE.fullmatch(seg.flight_number):
                issues.append(f"Invalid flight number format: {seg.flight_number!r}.")

        if strict and issues:
            raise ValidationError("Booking failed validation.", errors=issues)

        for issue in issues:
            _logger.warning("Validation: %s", issue)
        return issues
@@ -0,0 +1,5 @@
1
+ """Static data tables used by parsers (airport directories, etc.)."""
2
+
3
+ from .airports import resolve_airport
4
+
5
+ __all__ = ["resolve_airport"]
@@ -0,0 +1,70 @@
1
+ """Airport name -> IATA resolution.
2
+
3
+ E-ticket receipts frequently print full airport names ("TOKYO NARITA INTL, JP")
4
+ rather than IATA codes. This module resolves those names to IATA codes using a
5
+ keyword-matching strategy that is resilient to ordering and punctuation.
6
+
7
+ The table is intentionally a curated subset of high-traffic airports plus every
8
+ airport required by the bundled test fixtures. It is data, not logic — extend it
9
+ freely without touching the parsers.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
# IATA code -> tuple of distinguishing keyword sets. A name matches a code when
# *all* keywords in any one set are present (case-insensitive) in the name.
# Keywords must be lowercase and contain only letters, digits and single
# spaces, because names are normalised to that form before matching.
_AIRPORT_KEYWORDS: dict[str, tuple[tuple[str, ...], ...]] = {
    "NRT": (("tokyo", "narita"), ("narita",)),
    "HND": (("tokyo", "haneda"), ("haneda",)),
    "HAN": (("hanoi", "noi bai"), ("noi bai",)),
    "SGN": (("ho chi minh", "tan son nhat"), ("tan son nhat",), ("saigon",)),
    "DAD": (("da nang",), ("danang",)),
    "CDG": (("paris", "charles de gaulle"), ("charles de gaulle",)),
    "ORY": (("paris", "orly"),),
    "LHR": (("london", "heathrow"), ("heathrow",)),
    "LGW": (("london", "gatwick"),),
    "FRA": (("frankfurt",),),
    "AMS": (("amsterdam", "schiphol"), ("schiphol",)),
    "ICN": (("seoul", "incheon"), ("incheon",)),
    "GMP": (("seoul", "gimpo"),),
    "PEK": (("beijing", "capital"),),
    "PKX": (("beijing", "daxing"),),
    "PVG": (("shanghai", "pudong"), ("pudong",)),
    "HKG": (("hong kong",),),
    "TPE": (("taipei", "taoyuan"), ("taoyuan",)),
    "SIN": (("singapore", "changi"), ("changi",)),
    "BKK": (("bangkok", "suvarnabhumi"), ("suvarnabhumi",)),
    "DMK": (("bangkok", "don muang"),),
    "KUL": (("kuala lumpur",),),
    "SYD": (("sydney",),),
    "MEL": (("melbourne",),),
    "LAX": (("los angeles",),),
    "SFO": (("san francisco",),),
    "JFK": (("new york", "kennedy"), ("john f kennedy",)),
    "DXB": (("dubai",),),
    "DOH": (("doha", "hamad"),),
}


def resolve_airport(name: str | None) -> str | None:
    """Resolve a printed airport name to its IATA code.

    The name is lower-cased and every non-alphanumeric character (commas,
    parentheses, periods, hyphens, slashes, ...) is treated as a separator, so
    ``"Tokyo (NRT)"``, ``"John F. Kennedy"`` and ``"Noi-Bai"`` all resolve.

    Returns the 3-letter IATA code, or ``None`` when ``name`` is empty or no
    entry in the table matches (callers then fall back to the raw name).
    """

    if not name:
        return None

    # Punctuation becomes whitespace, then runs of whitespace collapse to one
    # space so multi-word keywords such as "noi bai" match reliably.
    separated = "".join(ch if ch.isalnum() else " " for ch in name.lower())
    text = " ".join(separated.split())

    # Fast path: an explicit 3-letter IATA token already present in the name.
    for token in text.upper().split():
        if len(token) == 3 and token.isalpha() and token in _AIRPORT_KEYWORDS:
            return token

    # Otherwise the first code (in table order) with a fully matched set wins.
    for code, keyword_sets in _AIRPORT_KEYWORDS.items():
        for keywords in keyword_sets:
            if all(kw in text for kw in keywords):
                return code
    return None
@@ -0,0 +1,19 @@
1
+ """Public exception surface for the SDK."""
2
+
3
+ from .parser_error import (
4
+ DocumentReadError,
5
+ ETicketSDKError,
6
+ OCRFailedError,
7
+ ParserError,
8
+ UnsupportedDocumentError,
9
+ )
10
+ from .validation_error import ValidationError
11
+
12
+ __all__ = [
13
+ "ETicketSDKError",
14
+ "DocumentReadError",
15
+ "UnsupportedDocumentError",
16
+ "OCRFailedError",
17
+ "ParserError",
18
+ "ValidationError",
19
+ ]
@@ -0,0 +1,40 @@
1
+ """Custom exceptions for the document/parsing pipeline.
2
+
3
+ All SDK exceptions derive from :class:`ETicketSDKError` so callers can catch the
4
+ whole family with a single ``except`` clause while still being able to handle
5
+ specific failure modes.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+
13
class ETicketSDKError(Exception):
    """Root of the SDK's exception hierarchy.

    Carries the human-readable ``message`` plus optional ``details`` (any
    value), both exposed as attributes.
    """

    def __init__(self, message: str, *, details: Any | None = None) -> None:
        self.message = message
        self.details = details
        super().__init__(message)

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Falsy details (None, "", 0, ...) are left out of the rendering.
        return f"{self.message} ({self.details})" if self.details else self.message


class DocumentReadError(ETicketSDKError):
    """A document could not be opened or read (for example corrupt or empty)."""


class UnsupportedDocumentError(ETicketSDKError):
    """The input type or the document layout is not supported."""


class OCRFailedError(ETicketSDKError):
    """OCR fallback was needed but is unavailable or failed."""


class ParserError(ETicketSDKError):
    """A parser could not extract the minimum required fields."""
@@ -0,0 +1,25 @@
1
+ """Validation-related exceptions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from .parser_error import ETicketSDKError
8
+
9
+
10
class ValidationError(ETicketSDKError):
    """Signals that extracted data broke one or more business rules.

    The ``errors`` attribute lists every human-readable problem found, letting
    callers report all of them together rather than one per attempt.
    """

    def __init__(
        self,
        message: str,
        *,
        errors: list[str] | None = None,
        details: Any | None = None,
    ) -> None:
        super().__init__(message, details=details)
        # A missing or empty ``errors`` argument becomes a fresh empty list.
        self.errors: list[str] = errors if errors else []
@@ -0,0 +1,19 @@
1
+ """Domain models (Pydantic v2)."""
2
+
3
+ from .booking import Booking, BookingStatus
4
+ from .flight import FlightSegment
5
+ from .passenger import Passenger
6
+ from .response import DocumentType, ExtractionMethod, ParseResult
7
+ from .ticket import TaxItem, Ticket
8
+
9
+ __all__ = [
10
+ "Passenger",
11
+ "FlightSegment",
12
+ "Ticket",
13
+ "TaxItem",
14
+ "Booking",
15
+ "BookingStatus",
16
+ "ParseResult",
17
+ "DocumentType",
18
+ "ExtractionMethod",
19
+ ]