eticket-document-sdk 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eticket_document_sdk/__init__.py +71 -0
- eticket_document_sdk/core/__init__.py +14 -0
- eticket_document_sdk/core/classifier.py +58 -0
- eticket_document_sdk/core/extractor.py +102 -0
- eticket_document_sdk/core/parser.py +155 -0
- eticket_document_sdk/core/validator.py +64 -0
- eticket_document_sdk/data/__init__.py +5 -0
- eticket_document_sdk/data/airports.py +70 -0
- eticket_document_sdk/exceptions/__init__.py +19 -0
- eticket_document_sdk/exceptions/parser_error.py +40 -0
- eticket_document_sdk/exceptions/validation_error.py +25 -0
- eticket_document_sdk/models/__init__.py +19 -0
- eticket_document_sdk/models/booking.py +59 -0
- eticket_document_sdk/models/flight.py +41 -0
- eticket_document_sdk/models/passenger.py +31 -0
- eticket_document_sdk/models/response.py +46 -0
- eticket_document_sdk/models/ticket.py +47 -0
- eticket_document_sdk/ocr/__init__.py +7 -0
- eticket_document_sdk/ocr/base.py +34 -0
- eticket_document_sdk/ocr/fallback.py +89 -0
- eticket_document_sdk/ocr/paddle_engine.py +104 -0
- eticket_document_sdk/parsers/__init__.py +23 -0
- eticket_document_sdk/parsers/airline/__init__.py +34 -0
- eticket_document_sdk/parsers/airline/generic_airline.py +69 -0
- eticket_document_sdk/parsers/airline/registry.py +90 -0
- eticket_document_sdk/parsers/airline/vietnam_airlines.py +362 -0
- eticket_document_sdk/parsers/base.py +33 -0
- eticket_document_sdk/parsers/generic/__init__.py +5 -0
- eticket_document_sdk/parsers/generic/parser.py +89 -0
- eticket_document_sdk/pdf/__init__.py +5 -0
- eticket_document_sdk/pdf/image_converter.py +59 -0
- eticket_document_sdk/pdf/pdf_reader.py +62 -0
- eticket_document_sdk/pdf/text_extractor.py +99 -0
- eticket_document_sdk/py.typed +0 -0
- eticket_document_sdk/schemas/__init__.py +25 -0
- eticket_document_sdk/schemas/pydantic_models.py +28 -0
- eticket_document_sdk/utils/__init__.py +18 -0
- eticket_document_sdk/utils/currency.py +67 -0
- eticket_document_sdk/utils/date_utils.py +40 -0
- eticket_document_sdk/utils/logging.py +49 -0
- eticket_document_sdk/utils/regex.py +141 -0
- eticket_document_sdk-1.0.0.dist-info/METADATA +399 -0
- eticket_document_sdk-1.0.0.dist-info/RECORD +45 -0
- eticket_document_sdk-1.0.0.dist-info/WHEEL +4 -0
- eticket_document_sdk-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""eticket-document-sdk.
|
|
2
|
+
|
|
3
|
+
Parse airline e-ticket PDFs into strongly-typed JSON.
|
|
4
|
+
|
|
5
|
+
Quick start::
|
|
6
|
+
|
|
7
|
+
from eticket_document_sdk import ETicketParser
|
|
8
|
+
|
|
9
|
+
parser = ETicketParser()
|
|
10
|
+
booking = parser.parse("ticket.pdf")
|
|
11
|
+
print(booking.model_dump())
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from .core.parser import ETicketParser
|
|
17
|
+
from .exceptions import (
|
|
18
|
+
DocumentReadError,
|
|
19
|
+
ETicketSDKError,
|
|
20
|
+
OCRFailedError,
|
|
21
|
+
ParserError,
|
|
22
|
+
UnsupportedDocumentError,
|
|
23
|
+
ValidationError,
|
|
24
|
+
)
|
|
25
|
+
from .models import (
|
|
26
|
+
Booking,
|
|
27
|
+
BookingStatus,
|
|
28
|
+
DocumentType,
|
|
29
|
+
ExtractionMethod,
|
|
30
|
+
FlightSegment,
|
|
31
|
+
ParseResult,
|
|
32
|
+
Passenger,
|
|
33
|
+
TaxItem,
|
|
34
|
+
Ticket,
|
|
35
|
+
)
|
|
36
|
+
from .parsers.airline.registry import (
|
|
37
|
+
ParserRegistry,
|
|
38
|
+
get_default_registry,
|
|
39
|
+
register_parser,
|
|
40
|
+
)
|
|
41
|
+
from .parsers.base import BaseParser
|
|
42
|
+
|
|
43
|
+
__version__ = "1.0.0"
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"__version__",
|
|
47
|
+
# Public entry point
|
|
48
|
+
"ETicketParser",
|
|
49
|
+
# Plugin system
|
|
50
|
+
"BaseParser",
|
|
51
|
+
"ParserRegistry",
|
|
52
|
+
"register_parser",
|
|
53
|
+
"get_default_registry",
|
|
54
|
+
# Models
|
|
55
|
+
"Booking",
|
|
56
|
+
"BookingStatus",
|
|
57
|
+
"FlightSegment",
|
|
58
|
+
"Passenger",
|
|
59
|
+
"Ticket",
|
|
60
|
+
"TaxItem",
|
|
61
|
+
"ParseResult",
|
|
62
|
+
"DocumentType",
|
|
63
|
+
"ExtractionMethod",
|
|
64
|
+
# Exceptions
|
|
65
|
+
"ETicketSDKError",
|
|
66
|
+
"DocumentReadError",
|
|
67
|
+
"UnsupportedDocumentError",
|
|
68
|
+
"OCRFailedError",
|
|
69
|
+
"ParserError",
|
|
70
|
+
"ValidationError",
|
|
71
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Core pipeline: parser orchestration, extraction, classification, validation."""
|
|
2
|
+
|
|
3
|
+
from .classifier import DocumentClassifier
|
|
4
|
+
from .extractor import DocumentExtractor, ExtractionResult
|
|
5
|
+
from .parser import ETicketParser
|
|
6
|
+
from .validator import DocumentValidator
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"ETicketParser",
|
|
10
|
+
"DocumentExtractor",
|
|
11
|
+
"ExtractionResult",
|
|
12
|
+
"DocumentClassifier",
|
|
13
|
+
"DocumentValidator",
|
|
14
|
+
]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Document classification: choose the right parser for extracted text."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..models import DocumentType
|
|
6
|
+
from ..parsers.airline.registry import ParserRegistry, get_default_registry
|
|
7
|
+
from ..parsers.base import BaseParser
|
|
8
|
+
from ..pdf import pdf_reader
|
|
9
|
+
from ..utils.logging import get_logger
|
|
10
|
+
|
|
11
|
+
_logger = get_logger("core.classifier")
|
|
12
|
+
|
|
13
|
+
# Filename extensions that ``detect_type_from_path`` treats as images.
_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

# Leading-byte signatures of common image formats, matched against the start
# of the payload by ``detect_type_from_bytes``. TIFF is listed so that the
# byte-based detection agrees with the ``.tif``/``.tiff`` extensions above.
# NOTE(review): GIF is recognised by signature but ``.gif`` is not in
# ``_IMAGE_EXTS`` -- confirm whether the OCR backend accepts GIF input.
_IMAGE_MAGIC = (
    b"\x89PNG\r\n",  # PNG
    b"\xff\xd8\xff",  # JPEG
    b"GIF8",  # GIF
    b"BM",  # BMP
    b"II*\x00",  # TIFF (little-endian)
    b"MM\x00*",  # TIFF (big-endian)
)


class DocumentClassifier:
    """Detects the input type and selects the best parser for the content."""

    def __init__(self, registry: ParserRegistry | None = None) -> None:
        """Create a classifier.

        Args:
            registry: Parser registry to query in :meth:`classify`. When it is
                omitted (or falsy) the registry returned by
                ``get_default_registry()`` is used.
        """
        self._registry = registry or get_default_registry()

    # -- input type detection ------------------------------------------- #
    @staticmethod
    def detect_type_from_bytes(data: bytes) -> DocumentType:
        """Classify a raw payload by inspecting its leading bytes.

        Args:
            data: The document content.

        Returns:
            ``DocumentType.PDF`` when ``pdf_reader.is_pdf_bytes`` accepts the
            payload, ``DocumentType.IMAGE`` when it starts with a known image
            signature (PNG, JPEG, GIF, BMP, TIFF or WebP), otherwise
            ``DocumentType.UNKNOWN``.
        """
        if pdf_reader.is_pdf_bytes(data):
            return DocumentType.PDF

        head = data[:12]
        if head.startswith(_IMAGE_MAGIC):
            return DocumentType.IMAGE
        # WebP is a RIFF container: "RIFF", a 4-byte size, then "WEBP". The
        # size field varies, so it cannot be expressed as a plain prefix.
        if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
            return DocumentType.IMAGE
        return DocumentType.UNKNOWN

    @staticmethod
    def detect_type_from_path(path: str) -> DocumentType:
        """Classify a document by its filename extension (case-insensitive).

        Args:
            path: File path or name; only its suffix is examined.

        Returns:
            ``DocumentType.PDF`` for ``.pdf``, ``DocumentType.IMAGE`` for any
            extension in ``_IMAGE_EXTS``, ``DocumentType.TEXT`` for ``.txt`` /
            ``.text``, otherwise ``DocumentType.UNKNOWN``.
        """
        lower = path.lower()
        if lower.endswith(".pdf"):
            return DocumentType.PDF
        if lower.endswith(tuple(_IMAGE_EXTS)):
            return DocumentType.IMAGE
        if lower.endswith((".txt", ".text")):
            return DocumentType.TEXT
        return DocumentType.UNKNOWN

    # -- parser selection ----------------------------------------------- #
    def classify(self, text: str) -> tuple[BaseParser | None, float]:
        """Return the best-matching parser for ``text`` and its confidence.

        The decision is delegated to the registry's ``best_match``; its result
        is returned unchanged, so the parser may be ``None``.
        """
        parser, score = self._registry.best_match(text)
        if parser:
            _logger.debug("Classified as %s (score=%.3f)", parser.name, score)
        return parser, score
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Turn a raw input source into text, choosing text-layer vs OCR.
|
|
2
|
+
|
|
3
|
+
This is the implementation of pipeline steps 1-4: detect type, extract the text
|
|
4
|
+
layer, measure quality, and fall back to OCR only when necessary.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
from ..exceptions import DocumentReadError, UnsupportedDocumentError
|
|
12
|
+
from ..models import DocumentType, ExtractionMethod
|
|
13
|
+
from ..ocr.fallback import OCRFallback
|
|
14
|
+
from ..pdf import text_extractor
|
|
15
|
+
from ..utils.logging import get_logger
|
|
16
|
+
from .classifier import DocumentClassifier
|
|
17
|
+
|
|
18
|
+
_logger = get_logger("core.extractor")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ExtractionResult:
    """Result record of the text-acquisition stage of the pipeline."""

    # Text obtained for the document.
    text: str
    # Kind of input the text was obtained from.
    document_type: DocumentType
    # How the text was obtained.
    method: ExtractionMethod
    # Quality score computed for ``text``.
    quality: float
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DocumentExtractor:
    """Obtains text from PDF, image or raw-text inputs.

    PDFs are read through their text layer first and only sent to OCR when
    that text scores below the configured quality threshold or is shorter
    than the minimum length. Images always go to OCR.
    """

    def __init__(
        self,
        *,
        ocr: OCRFallback,
        classifier: DocumentClassifier | None = None,
        text_quality_threshold: float = 0.35,
        min_text_length: int = 80,
    ) -> None:
        """Store the collaborators and thresholds.

        Args:
            ocr: OCR fallback used for images and for PDFs whose text layer
                is insufficient.
            classifier: Used to detect the input type from bytes; a default
                ``DocumentClassifier`` is created when omitted.
            text_quality_threshold: Minimum quality score for accepting a PDF
                text layer.
            min_text_length: Minimum text length for accepting a PDF text
                layer; also forwarded to ``measure_quality``.
        """
        if not classifier:
            classifier = DocumentClassifier()
        self._classifier = classifier
        self._ocr = ocr
        self._min_length = min_text_length
        self._threshold = text_quality_threshold

    def extract_from_bytes(
        self, data: bytes, *, document_type: DocumentType | None = None
    ) -> ExtractionResult:
        """Extract text from a PDF or image payload.

        Args:
            data: Raw document bytes.
            document_type: Known type of ``data``; detected from the bytes
                when not supplied.

        Raises:
            UnsupportedDocumentError: If the type is neither PDF nor image.
        """
        if document_type:
            kind = document_type
        else:
            kind = self._classifier.detect_type_from_bytes(data)

        if kind == DocumentType.PDF:
            return self._extract_pdf(data)
        if kind == DocumentType.IMAGE:
            return self._extract_image(data)

        raise UnsupportedDocumentError(
            f"Unsupported document type: {kind.value}. "
            "Provide a PDF, an image, or use parse_text() for raw text."
        )

    def extract_from_text(self, text: str) -> ExtractionResult:
        """Wrap caller-supplied text in an :class:`ExtractionResult`.

        Raises:
            DocumentReadError: If ``text`` is empty or whitespace only.
        """
        if not text or not text.strip():
            raise DocumentReadError("Empty text supplied to parse_text().")
        score = self._score(text)
        return ExtractionResult(
            text=text,
            document_type=DocumentType.TEXT,
            method=ExtractionMethod.RAW_TEXT,
            quality=score,
        )

    # -- internals ------------------------------------------------------ #
    def _score(self, text: str) -> float:
        """Quality of ``text`` using the configured minimum length."""
        return text_extractor.measure_quality(text, min_length=self._min_length)

    def _ocr_outcome(self, recognised: str, kind: DocumentType) -> ExtractionResult:
        """Build the result record for text that came out of OCR."""
        return ExtractionResult(
            text=recognised,
            document_type=kind,
            method=ExtractionMethod.OCR,
            quality=self._score(recognised),
        )

    def _extract_pdf(self, data: bytes) -> ExtractionResult:
        """Use the PDF text layer when it is good enough, otherwise OCR."""
        layer_text = text_extractor.extract_text(data)
        layer_quality = self._score(layer_text)
        _logger.debug(
            "PDF text-layer quality=%.3f length=%d", layer_quality, len(layer_text)
        )

        # Both conditions must hold for the text layer to be trusted.
        usable = (
            layer_quality >= self._threshold and len(layer_text) >= self._min_length
        )
        if usable:
            return ExtractionResult(
                text=layer_text,
                document_type=DocumentType.PDF,
                method=ExtractionMethod.TEXT_LAYER,
                quality=layer_quality,
            )

        _logger.info(
            "Text layer insufficient (q=%.3f); attempting OCR fallback", layer_quality
        )
        return self._ocr_outcome(self._ocr.run_on_pdf(data), DocumentType.PDF)

    def _extract_image(self, data: bytes) -> ExtractionResult:
        """Run OCR on an image; there is no text layer to try first."""
        return self._ocr_outcome(self._ocr.run_on_image(data), DocumentType.IMAGE)
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Public SDK entry point: :class:`ETicketParser`.
|
|
2
|
+
|
|
3
|
+
Orchestrates the full pipeline (detect -> extract -> classify -> parse ->
|
|
4
|
+
validate -> return) behind a deliberately tiny API:
|
|
5
|
+
|
|
6
|
+
parser = ETicketParser()
|
|
7
|
+
booking = parser.parse("ticket.pdf")
|
|
8
|
+
print(booking.model_dump())
|
|
9
|
+
|
|
10
|
+
The parser instance is reusable and thread-safe: the OCR model, compiled regex
|
|
11
|
+
and parser registry are all created once and shared.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from ..exceptions import ParserError
|
|
20
|
+
from ..models import Booking, ParseResult
|
|
21
|
+
from ..ocr.fallback import OCRFallback
|
|
22
|
+
from ..parsers.airline.registry import ParserRegistry, get_default_registry
|
|
23
|
+
from ..utils.logging import configure_logging, get_logger
|
|
24
|
+
from .classifier import DocumentClassifier
|
|
25
|
+
from .extractor import DocumentExtractor, ExtractionResult
|
|
26
|
+
from .validator import DocumentValidator
|
|
27
|
+
|
|
28
|
+
_logger = get_logger("core.parser")
|
|
29
|
+
|
|
30
|
+
# Fields used to compute a heuristic confidence/completeness score.
|
|
31
|
+
_KEY_FIELDS = ("booking_code", "ticket_number", "currency", "total_price")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ETicketParser:
    """Facade that turns airline e-ticket documents into :class:`Booking` objects.

    Each ``parse*`` method has a ``*_detailed`` twin that returns the whole
    :class:`ParseResult` envelope instead of only its ``booking``.
    """

    def __init__(
        self,
        *,
        enable_ocr: bool = True,
        ocr_langs: list[str] | None = None,
        debug: bool = False,
        strict_validation: bool = False,
        text_quality_threshold: float = 0.35,
        registry: ParserRegistry | None = None,
    ) -> None:
        """Wire up the pipeline components.

        Args:
            enable_ocr: Forwarded to ``OCRFallback`` as ``enabled``.
            ocr_langs: OCR languages; ``["vi", "en", "ja"]`` when omitted.
            debug: When true, ``configure_logging`` is called with
                ``logging.DEBUG``.
            strict_validation: Passed to the validator as ``strict`` on every
                parse.
            text_quality_threshold: Forwarded to ``DocumentExtractor``.
            registry: Parser registry; ``get_default_registry()`` when
                omitted.
        """
        if debug:
            configure_logging(logging.DEBUG)

        self.debug = debug
        self.strict_validation = strict_validation

        if not registry:
            registry = get_default_registry()
        self._registry = registry

        languages = ocr_langs or ["vi", "en", "ja"]
        self._ocr = OCRFallback(enabled=enable_ocr, languages=languages)
        self._classifier = DocumentClassifier(self._registry)
        self._extractor = DocumentExtractor(
            ocr=self._ocr,
            classifier=self._classifier,
            text_quality_threshold=text_quality_threshold,
        )
        self._validator = DocumentValidator()

    # ------------------------------------------------------------------ #
    # Public API (returns Booking)
    # ------------------------------------------------------------------ #
    def parse(self, source: str | Path) -> Booking:
        """Parse the file at ``source`` and return its :class:`Booking`."""
        result = self.parse_detailed(source)
        return result.booking  # type: ignore[return-value]

    def parse_bytes(self, data: bytes) -> Booking:
        """Parse in-memory document bytes (PDF or image)."""
        result = self.parse_bytes_detailed(data)
        return result.booking  # type: ignore[return-value]

    def parse_text(self, text: str) -> Booking:
        """Parse text that has already been extracted (no PDF/OCR step)."""
        result = self.parse_text_detailed(text)
        return result.booking  # type: ignore[return-value]

    # ------------------------------------------------------------------ #
    # Public API (returns ParseResult envelope)
    # ------------------------------------------------------------------ #
    def parse_detailed(self, source: str | Path) -> ParseResult:
        """File-path variant returning the full :class:`ParseResult`.

        The type is first guessed from the file extension. Text files are
        decoded as UTF-8 (undecodable bytes replaced) and parsed as raw text;
        for an unrecognised extension the extractor is left to detect the
        type from the bytes.
        """
        from ..pdf import pdf_reader

        location = str(source)
        payload = pdf_reader.read_bytes(location)
        guessed = self._classifier.detect_type_from_path(location)

        if guessed.name == "TEXT":
            decoded = payload.decode("utf-8", errors="replace")
            return self.parse_text_detailed(decoded)

        # An UNKNOWN guess is passed on as "no hint" rather than as a type.
        hint = None if guessed.name == "UNKNOWN" else guessed
        extraction = self._extractor.extract_from_bytes(payload, document_type=hint)
        return self._finish(extraction)

    def parse_bytes_detailed(self, data: bytes) -> ParseResult:
        """Bytes variant returning the full :class:`ParseResult`."""
        return self._finish(self._extractor.extract_from_bytes(data))

    def parse_text_detailed(self, text: str) -> ParseResult:
        """Raw-text variant returning the full :class:`ParseResult`."""
        return self._finish(self._extractor.extract_from_text(text))

    # ------------------------------------------------------------------ #
    # Pipeline tail: classify -> parse -> validate -> envelope
    # ------------------------------------------------------------------ #
    def _finish(self, extraction: ExtractionResult) -> ParseResult:
        """Classify the extracted text, parse it, validate and wrap the result.

        Raises:
            ParserError: If no parser matches, or if the chosen parser raises
                (non-``ParserError`` exceptions are wrapped, with the original
                as ``__cause__``).
        """
        chosen, match_score = self._classifier.classify(extraction.text)
        if chosen is None:
            raise ParserError(
                "No parser could handle this document. "
                "Register an airline parser or check the input."
            )

        _logger.info(
            "Parsing with %s (match=%.3f, method=%s)",
            chosen.name,
            match_score,
            extraction.method.value,
        )

        try:
            booking = chosen.parse(extraction.text)
        except Exception as exc:  # surface parser bugs as ParserError
            if isinstance(exc, ParserError):
                # Already the SDK's own error type: propagate untouched.
                raise
            raise ParserError(
                f"Parser {chosen.name!r} failed.", details=str(exc)
            ) from exc

        found_issues = self._validator.validate(booking, strict=self.strict_validation)
        score = _completeness(booking)

        return ParseResult(
            success=True,
            document_type=extraction.document_type,
            extraction_method=extraction.method,
            parser=chosen.name,
            confidence=score,
            booking=booking,
            warnings=found_issues,
        )
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _completeness(booking: Booking) -> float:
    """Heuristic confidence score: how complete the parsed booking is.

    70% of the score is the fraction of ``_KEY_FIELDS`` that are present
    (not ``None``); the remaining 30% is awarded when the booking has at
    least one segment. The result is rounded to four decimals.

    A key field counts as present when it is set either directly on
    ``booking`` or on ``booking.ticket``. The ticket fallback is needed
    because the validator reads these same fields from ``booking.ticket``;
    looking only at the booking object would score them as missing whenever
    the booking does not expose them itself.
    """
    ticket = getattr(booking, "ticket", None)

    def _is_present(field: str) -> bool:
        if getattr(booking, field, None) is not None:
            return True
        # getattr on a ``None`` ticket simply yields the default.
        return getattr(ticket, field, None) is not None

    present = sum(1 for f in _KEY_FIELDS if _is_present(f))
    field_score = present / len(_KEY_FIELDS)
    segment_score = 1.0 if booking.segments else 0.0
    return round(0.7 * field_score + 0.3 * segment_score, 4)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Business-rule validation for parsed bookings.
|
|
2
|
+
|
|
3
|
+
Pydantic validates field *types*; this layer validates cross-field *business*
|
|
4
|
+
rules (ticket number length, currency sanity, presence of segments) and either
|
|
5
|
+
raises :class:`ValidationError` (strict mode) or returns a list of warnings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from ..exceptions import ValidationError
|
|
13
|
+
from ..models import Booking
|
|
14
|
+
from ..utils.currency import is_valid_currency
|
|
15
|
+
from ..utils.logging import get_logger
|
|
16
|
+
|
|
17
|
+
_logger = get_logger("core.validator")
|
|
18
|
+
|
|
19
|
+
_TICKET_RE = re.compile(r"^\d{13}$")
|
|
20
|
+
_PNR_RE = re.compile(r"^[A-Z0-9]{6}$")
|
|
21
|
+
_FLIGHT_RE = re.compile(r"^([A-Z]{2}|[A-Z]\d)\d{1,4}$")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DocumentValidator:
    """Validates a :class:`Booking` against airline-ticket business rules."""

    def validate(self, booking: Booking, *, strict: bool = False) -> list[str]:
        """Check a parsed booking and report every rule it breaks.

        Rules checked: booking code present and matching ``_PNR_RE``; ticket
        number present and matching ``_TICKET_RE``; currency (when set)
        accepted by ``is_valid_currency``; total price (when set) not
        negative; at least one segment; each segment's flight number (when
        set) matching ``_FLIGHT_RE``.

        Args:
            booking: The booking to check; ticket-level fields are read from
                ``booking.ticket``.
            strict: When true, any issue raises instead of being returned.

        Returns:
            Human-readable issue messages (empty when the booking is clean).
            In non-strict mode each one is also logged as a warning.

        Raises:
            ValidationError: In strict mode, if at least one issue was found;
                the messages are attached as ``errors``.
        """
        issues: list[str] = []
        t = booking.ticket

        # ``fullmatch`` is used instead of ``match``: with ``match`` the
        # trailing ``$`` also matches just before a final newline, so a value
        # such as "ABC123\n" would be accepted as well-formed.
        if not t.booking_code:
            issues.append("Missing booking code (PNR).")
        elif not _PNR_RE.fullmatch(t.booking_code):
            issues.append(f"Invalid booking code format: {t.booking_code!r}.")

        if not t.ticket_number:
            issues.append("Missing ticket number.")
        elif not _TICKET_RE.fullmatch(t.ticket_number):
            issues.append(f"Invalid ticket number format: {t.ticket_number!r}.")

        if t.currency and not is_valid_currency(t.currency):
            issues.append(f"Invalid currency code: {t.currency!r}.")

        if t.total_price is not None and t.total_price < 0:
            issues.append("Total price is negative.")

        if not booking.segments:
            issues.append("No flight segments found.")
        for seg in booking.segments:
            if seg.flight_number and not _FLIGHT_RE.fullmatch(seg.flight_number):
                issues.append(f"Invalid flight number format: {seg.flight_number!r}.")

        if strict and issues:
            raise ValidationError("Booking failed validation.", errors=issues)

        for issue in issues:
            _logger.warning("Validation: %s", issue)
        return issues
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Airport name -> IATA resolution.
|
|
2
|
+
|
|
3
|
+
E-ticket receipts frequently print full airport names ("TOKYO NARITA INTL, JP")
|
|
4
|
+
rather than IATA codes. This module resolves those names to IATA codes using a
|
|
5
|
+
keyword-matching strategy that is resilient to ordering and punctuation.
|
|
6
|
+
|
|
7
|
+
The table is intentionally a curated subset of high-traffic airports plus every
|
|
8
|
+
airport required by the bundled test fixtures. It is data, not logic — extend it
|
|
9
|
+
freely without touching the parsers.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
# IATA code -> tuple of distinguishing keyword sets. A name matches a code when
# *all* keywords in any one set are present (case-insensitive) in the name.
# Keywords consist of lowercase letters and single spaces only, which is the
# form ``resolve_airport`` normalises names to before matching.
_AIRPORT_KEYWORDS: dict[str, tuple[tuple[str, ...], ...]] = {
    "NRT": (("tokyo", "narita"), ("narita",)),
    "HND": (("tokyo", "haneda"), ("haneda",)),
    "HAN": (("hanoi", "noi bai"), ("noi bai",)),
    "SGN": (("ho chi minh", "tan son nhat"), ("tan son nhat",), ("saigon",)),
    "DAD": (("da nang",), ("danang",)),
    "CDG": (("paris", "charles de gaulle"), ("charles de gaulle",)),
    "ORY": (("paris", "orly"),),
    "LHR": (("london", "heathrow"), ("heathrow",)),
    "LGW": (("london", "gatwick"),),
    "FRA": (("frankfurt",),),
    "AMS": (("amsterdam", "schiphol"), ("schiphol",)),
    "ICN": (("seoul", "incheon"), ("incheon",)),
    "GMP": (("seoul", "gimpo"),),
    "PEK": (("beijing", "capital"),),
    "PKX": (("beijing", "daxing"),),
    "PVG": (("shanghai", "pudong"), ("pudong",)),
    "HKG": (("hong kong",),),
    "TPE": (("taipei", "taoyuan"), ("taoyuan",)),
    "SIN": (("singapore", "changi"), ("changi",)),
    "BKK": (("bangkok", "suvarnabhumi"), ("suvarnabhumi",)),
    "DMK": (("bangkok", "don muang"),),
    "KUL": (("kuala lumpur",),),
    "SYD": (("sydney",),),
    "MEL": (("melbourne",),),
    "LAX": (("los angeles",),),
    "SFO": (("san francisco",),),
    "JFK": (("new york", "kennedy"), ("john f kennedy",)),
    "DXB": (("dubai",),),
    "DOH": (("doha", "hamad"),),
}


def resolve_airport(name: str | None) -> str | None:
    """Resolve a printed airport name to its IATA code.

    The name is lowercased and every non-alphanumeric character (commas,
    parentheses, hyphens, periods, slashes, ...) is treated as a word
    separator, so ``"Ho Chi Minh City (SGN)"``, ``"John F. Kennedy"`` and
    ``"Charles-de-Gaulle"`` resolve the same way as their unpunctuated forms.

    Args:
        name: Airport name as printed on the document; may be ``None``.

    Returns:
        The 3-letter IATA code, or ``None`` when ``name`` is empty or matches
        no entry of ``_AIRPORT_KEYWORDS`` (callers then fall back to the raw
        name).
    """
    if not name:
        return None

    # Normalise: punctuation -> space, then collapse runs of whitespace.
    cleaned = "".join(ch if ch.isalnum() else " " for ch in name.lower())
    text = " ".join(cleaned.split())

    # Fast path: an explicit 3-letter IATA token already present in the name.
    for token in text.upper().split():
        if len(token) == 3 and token.isalpha() and token in _AIRPORT_KEYWORDS:
            return token

    # Keyword path: first code (in table order) with a fully matching set.
    for code, keyword_sets in _AIRPORT_KEYWORDS.items():
        if any(all(kw in text for kw in keywords) for keywords in keyword_sets):
            return code
    return None
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Public exception surface for the SDK."""
|
|
2
|
+
|
|
3
|
+
from .parser_error import (
|
|
4
|
+
DocumentReadError,
|
|
5
|
+
ETicketSDKError,
|
|
6
|
+
OCRFailedError,
|
|
7
|
+
ParserError,
|
|
8
|
+
UnsupportedDocumentError,
|
|
9
|
+
)
|
|
10
|
+
from .validation_error import ValidationError
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"ETicketSDKError",
|
|
14
|
+
"DocumentReadError",
|
|
15
|
+
"UnsupportedDocumentError",
|
|
16
|
+
"OCRFailedError",
|
|
17
|
+
"ParserError",
|
|
18
|
+
"ValidationError",
|
|
19
|
+
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Custom exceptions for the document/parsing pipeline.
|
|
2
|
+
|
|
3
|
+
All SDK exceptions derive from :class:`ETicketSDKError` so callers can catch the
|
|
4
|
+
whole family with a single ``except`` clause while still being able to handle
|
|
5
|
+
specific failure modes.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ETicketSDKError(Exception):
    """Root of the SDK's exception hierarchy.

    Carries a human-readable ``message`` plus optional ``details`` of any
    type; truthy details are appended in parentheses when the error is
    rendered as a string.
    """

    def __init__(self, message: str, *, details: Any | None = None) -> None:
        """Record ``message`` and ``details``; only the message goes into ``args``."""
        self.message, self.details = message, details
        super().__init__(message)

    def __str__(self) -> str:  # pragma: no cover - trivial
        """Return the message, followed by ``(details)`` when details are truthy."""
        return f"{self.message} ({self.details})" if self.details else self.message
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DocumentReadError(ETicketSDKError):
    """Signals that a document could not be opened or read, e.g. corrupt or empty input."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class UnsupportedDocumentError(ETicketSDKError):
    """Signals an input type or document layout that the SDK does not support."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class OCRFailedError(ETicketSDKError):
    """Signals that the OCR fallback was needed but was unavailable or failed."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ParserError(ETicketSDKError):
    """Signals that a parser could not extract the minimum required fields."""
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Validation-related exceptions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .parser_error import ETicketSDKError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ValidationError(ETicketSDKError):
    """Signals that extracted data broke one or more business rules.

    The individual, human-readable problems are available as ``errors`` so a
    caller can report all of them together rather than one per attempt.
    """

    def __init__(
        self,
        message: str,
        *,
        errors: list[str] | None = None,
        details: Any | None = None,
    ) -> None:
        """Create the error.

        Args:
            message: Summary of the failure.
            errors: Field-level problems; stored as an empty list when
                omitted or empty.
            details: Optional extra context, passed to the base class.
        """
        super().__init__(message, details=details)
        problems: list[str] = errors if errors else []
        self.errors: list[str] = problems
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Domain models (Pydantic v2)."""
|
|
2
|
+
|
|
3
|
+
from .booking import Booking, BookingStatus
|
|
4
|
+
from .flight import FlightSegment
|
|
5
|
+
from .passenger import Passenger
|
|
6
|
+
from .response import DocumentType, ExtractionMethod, ParseResult
|
|
7
|
+
from .ticket import TaxItem, Ticket
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"Passenger",
|
|
11
|
+
"FlightSegment",
|
|
12
|
+
"Ticket",
|
|
13
|
+
"TaxItem",
|
|
14
|
+
"Booking",
|
|
15
|
+
"BookingStatus",
|
|
16
|
+
"ParseResult",
|
|
17
|
+
"DocumentType",
|
|
18
|
+
"ExtractionMethod",
|
|
19
|
+
]
|