PyPI - docid - Version diff - docid-0.1.3.tar.gz → docid-0.1.4.tar.gz - Mend

docid-0.1.3.tar.gz → docid-0.1.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

{docid-0.1.3 → docid-0.1.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docid
-Version: 0.1.3
+Version: 0.1.4
 Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
 Home-page: https://github.com/softreck/docid
 Author: Softreck

{docid-0.1.3 → docid-0.1.4}/docid.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docid
-Version: 0.1.3
+Version: 0.1.4
 Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
 Home-page: https://github.com/softreck/docid
 Author: Softreck

{docid-0.1.3 → docid-0.1.4}/docid.egg-info/SOURCES.txt RENAMED Viewed

@@ -18,4 +18,5 @@ exef_docid/pipeline.py
 exef_docid/extractors/__init__.py
 exef_docid/extractors/base.py
 tests/test_document_id.py
-tests/test_extractors.py
+tests/test_extractors.py
+tests/test_samples_id.py

{docid-0.1.3 → docid-0.1.4}/exef_docid/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 """
-EXEF Document ID Generator
+DOC Document ID Generator
 Deterministyczny generator identyfikatorów dokumentów z OCR.
 Generuje zawsze ten sam ID dla tego samego dokumentu,
@@ -10,14 +10,14 @@ Przykład użycia:
     # Pełne przetwarzanie
     result = process_document("faktura.pdf")
-    print(result.document_id)      # EXEF-FV-A7B3C9D2E1F04856
+    print(result.document_id)      # DOC-FV-A7B3C9D2E1F04856
     print(result.extraction.issuer_nip)  # 5213017228
     # Tylko ID
     doc_id = get_document_id("paragon.jpg")
     # Weryfikacja
-    is_same = verify_document_id("skan.png", "EXEF-FV-A7B3C9D2E1F04856")
+    is_same = verify_document_id("skan.png", "DOC-FV-A7B3C9D2E1F04856")
 Wymagania:
     pip install paddleocr paddlepaddle pdf2image pillow

{docid-0.1.3 → docid-0.1.4}/exef_docid/cli.py RENAMED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-CLI dla EXEF Document ID Generator.
+CLI dla DOC Document ID Generator.
 Użycie:
     # Przetwórz pojedynczy plik
@@ -13,7 +13,7 @@ Użycie:
     docid batch ./dokumenty/ --output results.json
     # Weryfikacja ID
-    docid verify faktura.pdf EXEF-FV-A7B3C9D2E1F04856
+    docid verify faktura.pdf DOC-FV-A7B3C9D2E1F04856
     # Tylko OCR (bez generowania ID)
     docid ocr skan.jpg
@@ -267,7 +267,7 @@ def cmd_generate_id(args):
 def main():
     parser = argparse.ArgumentParser(
-        description='EXEF Document ID Generator - deterministyczne ID dokumentów z OCR',
+        description='DOC Document ID Generator - deterministyczne ID dokumentów z OCR',
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument('--version', action='version', version='docid 0.1.0')
@@ -279,7 +279,7 @@ def main():
     common.add_argument('--engine', choices=['paddle', 'tesseract'], default='paddle',
                        help='Silnik OCR (domyślnie: paddle)')
     common.add_argument('--lang', default='pl', help='Język dokumentów')
-    common.add_argument('--prefix', default='EXEF', help='Prefiks ID')
+    common.add_argument('--prefix', default='DOC', help='Prefiks ID')
     common.add_argument('--gpu', action='store_true', help='Użyj GPU')
     common.add_argument('-v', '--verbose', action='store_true', help='Więcej szczegółów')
@@ -324,7 +324,7 @@ def main():
     p_gen.add_argument('--number', help='Numer dokumentu')
     p_gen.add_argument('--date', help='Data (YYYY-MM-DD)')
     p_gen.add_argument('--amount', help='Kwota brutto')
-    p_gen.add_argument('--prefix', default='EXEF', help='Prefiks ID')
+    p_gen.add_argument('--prefix', default='DOC', help='Prefiks ID')
     p_gen.set_defaults(func=cmd_generate_id)
     args = parser.parse_args()

{docid-0.1.3 → docid-0.1.4}/exef_docid/cli_universal.py RENAMED Viewed

@@ -431,7 +431,7 @@ def main():
     """Main CLI entry point"""
     parser = argparse.ArgumentParser(
         prog='docid',
-        description='EXEF Document ID Generator - CLI'
+        description='DOC Document ID Generator - CLI'
     )
     parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')

{docid-0.1.3 → docid-0.1.4}/exef_docid/document_id.py RENAMED Viewed

@@ -14,8 +14,8 @@ from decimal import ROUND_HALF_UP, Decimal
 from enum import Enum
 from typing import Optional, Union
-# Namespace UUID dla EXEF (RFC 4122 UUID v5)
-EXEF_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
+# Namespace UUID dla DOC (RFC 4122 UUID v5)
+DOC_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
 class DocumentType(Enum):
@@ -207,10 +207,10 @@ class DocumentIDGenerator:
     niezależnie od formatu źródłowego dokumentu.
     """
-    def __init__(self, prefix: str = "EXEF"):
+    def __init__(self, prefix: str = "DOC"):
         """
         Args:
-            prefix: Prefiks identyfikatora (domyślnie EXEF)
+            prefix: Prefiks identyfikatora (domyślnie DOC)
         """
         self.prefix = prefix
@@ -229,7 +229,7 @@ class DocumentIDGenerator:
         >>> gen = DocumentIDGenerator()
         >>> gen.generate_invoice_id("5213017228", "FV/2025/00142", "2025-01-15", 1230.00)
-        'EXEF-FV-A7B3C9D2E1F04856'
+        'DOC-FV-A7B3C9D2E1F04856'
         """
         canonical = CanonicalData(
             document_type=DocumentType.INVOICE,
@@ -265,7 +265,7 @@ class DocumentIDGenerator:
         >>> gen = DocumentIDGenerator()
         >>> gen.generate_receipt_id("5213017228", "2025-01-15", 45.99)
-        'EXEF-PAR-...'
+        'DOC-PAR-...'
         """
         parts = [
             NIPValidator.normalize(seller_nip),
@@ -649,7 +649,7 @@ class DocumentIDGenerator:
         Generuje finalny identyfikator z danych kanonicznych.
         Format: {PREFIX}-{TYPE}-{HASH16}
-        Przykład: EXEF-FV-A7B3C9D2E1F04856
+        Przykład: DOC-FV-A7B3C9D2E1F04856
         """
         # SHA256 z canonical string
         hash_bytes = hashlib.sha256(canonical.canonical_string.encode('utf-8')).digest()
@@ -662,7 +662,7 @@ class DocumentIDGenerator:
         Weryfikuje czy ID odpowiada danym kanonicznym.
         >>> gen = DocumentIDGenerator()
-        >>> gen.verify_id("EXEF-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
+        >>> gen.verify_id("DOC-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
         True
         """
         hash_bytes = hashlib.sha256(canonical_string.encode('utf-8')).digest()
@@ -679,8 +679,8 @@ class DocumentIDGenerator:
         """
         Parsuje identyfikator dokumentu.
-        >>> DocumentIDGenerator.parse_id("EXEF-FV-A7B3C9D2E1F04856")
-        {'prefix': 'EXEF', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
+        >>> DocumentIDGenerator.parse_id("DOC-FV-A7B3C9D2E1F04856")
+        {'prefix': 'DOC', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
          'document_type': <DocumentType.INVOICE>}
         """
         parts = document_id.split('-')

{docid-0.1.3 → docid-0.1.4}/exef_docid/ocr_processor.py RENAMED Viewed

@@ -195,12 +195,8 @@ class PaddleOCRProcessor(BaseOCRProcessor):
                 self._ocr = PaddleOCR(
                     use_angle_cls=True,
                     lang=lang,
-                    use_gpu=self.use_gpu,
                     det_model_dir=self._det_model_dir,
                     rec_model_dir=self._rec_model_dir,
-                    # Optymalizacje CPU
-                    enable_mkldnn=True,
-                    cpu_threads=4,
                 )
             except ImportError:
                 raise ImportError(
@@ -216,7 +212,7 @@ class PaddleOCRProcessor(BaseOCRProcessor):
         self._init_ocr()
         image_path = str(image_path)
-        result = self._ocr.ocr(image_path, cls=True)
+        result = self._ocr.ocr(image_path)
         lines = []
         full_text_parts = []
@@ -439,8 +435,8 @@ class OCRProcessor:
     def __init__(
         self,
-        preferred_engine: OCREngine = OCREngine.PADDLE,
-        fallback_engine: OCREngine = OCREngine.TESSERACT,
+        preferred_engine: OCREngine = OCREngine.TESSERACT,
+        fallback_engine: OCREngine = OCREngine.PADDLE,
         lang: str = 'pl',
         use_gpu: bool = False,
     ):

{docid-0.1.3 → docid-0.1.4}/exef_docid/pipeline.py RENAMED Viewed

@@ -97,20 +97,20 @@ class DocumentPipeline:
     Przykład użycia:
         pipeline = DocumentPipeline()
         result = pipeline.process("faktura.pdf")
-        print(result.document_id)  # EXEF-FV-A7B3C9D2E1F04856
+        print(result.document_id)  # DOC-FV-A7B3C9D2E1F04856
     """
     def __init__(
         self,
-        ocr_engine: OCREngine = OCREngine.PADDLE,
-        id_prefix: str = "EXEF",
+        ocr_engine: OCREngine = OCREngine.TESSERACT,
+        id_prefix: str = "DOC",
         lang: str = "pl",
         use_gpu: bool = False,
     ):
         """
         Args:
             ocr_engine: Silnik OCR (PADDLE lub TESSERACT)
-            id_prefix: Prefiks identyfikatorów (domyślnie EXEF)
+            id_prefix: Prefiks identyfikatorów (domyślnie DOC)
             lang: Język dokumentów (pl, en)
             use_gpu: Czy używać GPU (domyślnie False dla CPU)
         """
@@ -390,7 +390,7 @@ class DocumentPipeline:
 _default_pipeline: Optional[DocumentPipeline] = None
-def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
+def get_pipeline(ocr_engine: OCREngine = OCREngine.TESSERACT) -> DocumentPipeline:
     """Zwraca domyślny pipeline (lazy init)."""
     global _default_pipeline
     if _default_pipeline is None or _default_pipeline.ocr.preferred_engine != ocr_engine:
@@ -398,7 +398,7 @@ def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
     return _default_pipeline
-def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.PADDLE, use_ocr: bool = True) -> ProcessedDocument:
+def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.TESSERACT, use_ocr: bool = True) -> ProcessedDocument:
     """
     Przetwarza dokument i zwraca wynik z ID.
@@ -416,7 +416,7 @@ def get_document_id(file_path: Union[str, Path]) -> str:
     Przykład:
         doc_id = get_document_id("faktura.pdf")
-        print(doc_id)  # EXEF-FV-A7B3C9D2E1F04856
+        print(doc_id)  # DOC-FV-A7B3C9D2E1F04856
     """
     return get_pipeline().process(file_path).document_id
@@ -426,6 +426,6 @@ def verify_document_id(file_path: Union[str, Path], expected_id: str) -> bool:
     Weryfikuje czy dokument ma oczekiwany ID.
     Przykład:
-        is_valid = verify_document_id("skan.jpg", "EXEF-FV-A7B3C9D2E1F04856")
+        is_valid = verify_document_id("skan.jpg", "DOC-FV-A7B3C9D2E1F04856")
     """
     return get_pipeline().verify_document(file_path, expected_id)

{docid-0.1.3 → docid-0.1.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docid"
-version = "0.1.3"
+version = "0.1.4"
 description = "Deterministyczny generator identyfikatorów dokumentów z OCR"
 readme = "README.md"
 license = {text = "MIT"}

docid-0.1.4/tests/test_samples_id.py ADDED Viewed

@@ -0,0 +1,277 @@
+"""
+Testy ID dla wszystkich próbek (samples).
+Sprawdza generowanie ID dla wszystkich plików w katalogu samples,
+w tym spójność ID między różnymi formatami tego samego dokumentu.
+"""
+import os
+from pathlib import Path
+import pytest
+from exef_docid.document_id import DocumentIDGenerator, DocumentType
+from exef_docid.pipeline import DocumentPipeline, process_document
+from exef_docid.ocr_processor import OCREngine
+SAMPLES_DIR = Path(__file__).parent.parent / "samples"
+class TestSampleIDGeneration:
+    """Testy generowania ID dla wszystkich próbek."""
+    @pytest.fixture
+    def pipeline(self):
+        """Fixture tworzący pipeline z Tesseract OCR."""
+        return DocumentPipeline(ocr_engine=OCREngine.TESSERACT)
+    def get_sample_files(self, subdirectory: str) -> list[Path]:
+        """Zwraca listę plików próbek z danego podkatalogu."""
+        sample_dir = SAMPLES_DIR / subdirectory
+        if not sample_dir.exists():
+            return []
+        files = []
+        for ext in ['*.pdf', '*.jpg', '*.jpeg', '*.png', '*.xml', '*.html', '*.htm', '*.txt']:
+            files.extend(sample_dir.glob(ext))
+        return sorted(files)
+    def test_invoice_samples_generate_id(self, pipeline):
+        """Test generowania ID dla wszystkich próbek faktur."""
+        invoice_files = self.get_sample_files("invoices")
+        assert len(invoice_files) > 0, "Brak plików faktur w samples/invoices"
+        results = []
+        for file_path in invoice_files:
+            try:
+                result = pipeline.process(file_path)
+                results.append({
+                    'file': file_path.name,
+                    'id': result.document_id,
+                    'type': result.document_type.value,
+                    'canonical': result.canonical_string,
+                    'confidence': result.ocr_confidence,
+                })
+                # Sprawdź czy ID zostało wygenerowane
+                assert result.document_id, f"Brak ID dla {file_path}"
+                assert result.document_id.startswith("EXEF-FV"), f"Nieprawidłowy prefix ID dla faktury: {result.document_id}"
+            except Exception as e:
+                pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
+        # Wypisz podsumowanie
+        print(f"\n=== Faktury ({len(results)} plików) ===")
+        for r in results:
+            print(f"  {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
+    def test_receipt_samples_generate_id(self, pipeline):
+        """Test generowania ID dla wszystkich próbek paragonów."""
+        receipt_files = self.get_sample_files("receipts")
+        assert len(receipt_files) > 0, "Brak plików paragonów w samples/receipts"
+        results = []
+        for file_path in receipt_files:
+            try:
+                result = pipeline.process(file_path)
+                results.append({
+                    'file': file_path.name,
+                    'id': result.document_id,
+                    'type': result.document_type.value,
+                    'confidence': result.ocr_confidence,
+                })
+                assert result.document_id, f"Brak ID dla {file_path}"
+            except Exception as e:
+                pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
+        print(f"\n=== Paragony ({len(results)} plików) ===")
+        for r in results:
+            print(f"  {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
+    def test_contract_samples_generate_id(self, pipeline):
+        """Test generowania ID dla wszystkich próbek umów."""
+        contract_files = self.get_sample_files("contracts")
+        assert len(contract_files) > 0, "Brak plików umów w samples/contracts"
+        results = []
+        for file_path in contract_files:
+            try:
+                result = pipeline.process(file_path)
+                results.append({
+                    'file': file_path.name,
+                    'id': result.document_id,
+                    'type': result.document_type.value,
+                    'confidence': result.ocr_confidence,
+                })
+                assert result.document_id, f"Brak ID dla {file_path}"
+            except Exception as e:
+                pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
+        print(f"\n=== Umowy ({len(results)} plików) ===")
+        for r in results:
+            print(f"  {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
+    def test_invoice_cross_format_consistency(self, pipeline):
+        """Test czy różne formaty tej samej faktury mają ten sam ID."""
+        invoice_dir = SAMPLES_DIR / "invoices"
+        # Znajdź wszystkie formaty faktury (faktura_full.*)
+        base_name = "faktura_full"
+        formats = []
+        for ext in ['.jpg', '.jpeg', '.png', '.pdf', '.xml', '.html', '.txt']:
+            file_path = invoice_dir / f"{base_name}{ext}"
+            if file_path.exists():
+                formats.append(file_path)
+        if len(formats) < 2:
+            pytest.skip(f"Za mało formatów faktury do testu (znaleziono: {len(formats)})")
+        # Przetwórz wszystkie formaty
+        ids = {}
+        for file_path in formats:
+            try:
+                result = pipeline.process(file_path)
+                ids[file_path.suffix] = {
+                    'id': result.document_id,
+                    'canonical': result.canonical_string,
+                    'confidence': result.ocr_confidence,
+                }
+            except Exception as e:
+                pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
+        # Sprawdź czy wszystkie formaty mają ten sam ID
+        unique_ids = set(r['id'] for r in ids.values())
+        print(f"\n=== Cross-format consistency dla {base_name} ===")
+        for ext, data in sorted(ids.items()):
+            print(f"  {ext}: {data['id']} (confidence: {data['confidence']:.2f})")
+        assert len(unique_ids) == 1, (
+            f"Różne formaty {base_name} mają różne ID: {unique_ids}\n"
+            f"Szczegóły: {ids}"
+        )
+    def test_receipt_cross_format_consistency(self, pipeline):
+        """Test czy różne formaty tego samego paragonu mają ten sam ID."""
+        receipt_dir = SAMPLES_DIR / "receipts"
+        base_name = "paragon_full"
+        formats = []
+        for ext in ['.jpg', '.jpeg', '.png', '.pdf', '.xml', '.html', '.txt']:
+            file_path = receipt_dir / f"{base_name}{ext}"
+            if file_path.exists():
+                formats.append(file_path)
+        if len(formats) < 2:
+            pytest.skip(f"Za mało formatów paragonu do testu (znaleziono: {len(formats)})")
+        ids = {}
+        for file_path in formats:
+            try:
+                result = pipeline.process(file_path)
+                ids[file_path.suffix] = {
+                    'id': result.document_id,
+                    'canonical': result.canonical_string,
+                    'confidence': result.ocr_confidence,
+                }
+            except Exception as e:
+                pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
+        unique_ids = set(r['id'] for r in ids.values())
+        print(f"\n=== Cross-format consistency dla {base_name} ===")
+        for ext, data in sorted(ids.items()):
+            print(f"  {ext}: {data['id']} (confidence: {data['confidence']:.2f})")
+        assert len(unique_ids) == 1, (
+            f"Różne formaty {base_name} mają różne ID: {unique_ids}\n"
+            f"Szczegóły: {ids}"
+        )
+    def test_all_samples_summary(self, pipeline):
+        """Podsumowanie wszystkich próbek."""
+        all_results = []
+        for subdir in ['invoices', 'receipts', 'contracts', 'universal']:
+            files = self.get_sample_files(subdir)
+            for file_path in files:
+                try:
+                    result = pipeline.process(file_path)
+                    all_results.append({
+                        'subdir': subdir,
+                        'file': file_path.name,
+                        'id': result.document_id,
+                        'type': result.document_type.value,
+                        'confidence': result.ocr_confidence,
+                    })
+                except Exception as e:
+                    all_results.append({
+                        'subdir': subdir,
+                        'file': file_path.name,
+                        'error': str(e),
+                    })
+        print(f"\n{'='*60}")
+        print(f"PODSUMOWANIE WSZYSTKICH PRÓBEK ({len(all_results)} plików)")
+        print(f"{'='*60}")
+        success_count = sum(1 for r in all_results if 'error' not in r)
+        error_count = sum(1 for r in all_results if 'error' in r)
+        for r in sorted(all_results, key=lambda x: (x['subdir'], x['file'])):
+            if 'error' in r:
+                print(f"  [ERR] {r['subdir']}/{r['file']}: {r['error']}")
+            else:
+                print(f"  [OK]  {r['subdir']}/{r['file']}: {r['id']}")
+        print(f"{'='*60}")
+        print(f"Sukcesy: {success_count}/{len(all_results)}, Błędy: {error_count}/{len(all_results)}")
+        print(f"{'='*60}")
+        # Nie failujemy testu przy błędach - to tylko podsumowanie
+        assert success_count > 0, "Żaden plik nie został przetworzony pomyślnie"
+class TestSampleIDDeterminism:
+    """Testy determinizmu ID - te same dane = ten sam ID."""
+    def test_invoice_deterministic_id(self):
+        """Test czy faktura generuje ten sam ID przy każdym uruchomieniu."""
+        invoice_dir = SAMPLES_DIR / "invoices"
+        # Znajdź pierwszy dostępny plik faktury
+        invoice_file = None
+        for ext in ['.txt', '.xml', '.html']:
+            candidate = invoice_dir / f"faktura_full{ext}"
+            if candidate.exists():
+                invoice_file = candidate
+                break
+        if not invoice_file:
+            pytest.skip("Brak pliku faktury do testu determinizmu")
+        # Generuj ID dwa razy
+        id1 = process_document(invoice_file).document_id
+        id2 = process_document(invoice_file).document_id
+        assert id1 == id2, f"ID nie jest deterministyczne: {id1} != {id2}"
+    def test_receipt_deterministic_id(self):
+        """Test czy paragon generuje ten sam ID przy każdym uruchomieniu."""
+        receipt_dir = SAMPLES_DIR / "receipts"
+        receipt_file = None
+        for ext in ['.txt', '.xml', '.html']:
+            candidate = receipt_dir / f"paragon_full{ext}"
+            if candidate.exists():
+                receipt_file = candidate
+                break
+        if not receipt_file:
+            pytest.skip("Brak pliku paragonu do testu determinizmu")
+        id1 = process_document(receipt_file).document_id
+        id2 = process_document(receipt_file).document_id
+        assert id1 == id2, f"ID nie jest deterministyczne: {id1} != {id2}"