kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/extraction.py CHANGED
@@ -1,12 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import TYPE_CHECKING, Final, cast
4
+ from typing import TYPE_CHECKING, Any, Final, cast
5
5
 
6
6
  import anyio
7
7
 
8
8
  from kreuzberg import ExtractionResult
9
9
  from kreuzberg._chunker import get_chunker
10
+ from kreuzberg._entity_extraction import extract_entities, extract_keywords
11
+ from kreuzberg._language_detection import detect_languages
10
12
  from kreuzberg._mime_types import (
11
13
  validate_mime_type,
12
14
  )
@@ -24,10 +26,7 @@ if TYPE_CHECKING:
24
26
  DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
25
27
 
26
28
 
27
- async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
28
- for validator in config.validators or []:
29
- await run_maybe_sync(validator, result)
30
-
29
+ def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
31
30
  if config.chunk_content:
32
31
  result.chunks = _handle_chunk_content(
33
32
  mime_type=result.mime_type,
@@ -35,6 +34,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
35
34
  content=result.content,
36
35
  )
37
36
 
37
+ if config.extract_entities:
38
+ try:
39
+ result.entities = extract_entities(
40
+ result.content,
41
+ custom_patterns=config.custom_entity_patterns,
42
+ )
43
+ except RuntimeError:
44
+ result.entities = None
45
+
46
+ if config.extract_keywords:
47
+ try:
48
+ result.keywords = extract_keywords(
49
+ result.content,
50
+ keyword_count=config.keyword_count,
51
+ )
52
+ except RuntimeError:
53
+ result.keywords = None
54
+
55
+ if config.auto_detect_language:
56
+ result.detected_languages = detect_languages(
57
+ result.content,
58
+ config=config.language_detection_config,
59
+ )
60
+
61
+ return result
62
+
63
+
64
+ async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
65
+ for validator in config.validators or []:
66
+ await run_maybe_sync(validator, result)
67
+
68
+ result = _validate_and_post_process_helper(result, config)
69
+
38
70
  for post_processor in config.post_processing_hooks or []:
39
71
  result = await run_maybe_sync(post_processor, result)
40
72
 
@@ -45,12 +77,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
45
77
  for validator in config.validators or []:
46
78
  run_sync_only(validator, result)
47
79
 
48
- if config.chunk_content:
49
- result.chunks = _handle_chunk_content(
50
- mime_type=result.mime_type,
51
- config=config,
52
- content=result.content,
53
- )
80
+ result = _validate_and_post_process_helper(result, config)
54
81
 
55
82
  for post_processor in config.post_processing_hooks or []:
56
83
  result = run_sync_only(post_processor, result)
@@ -62,7 +89,7 @@ def _handle_chunk_content(
62
89
  mime_type: str,
63
90
  config: ExtractionConfig,
64
91
  content: str,
65
- ) -> list[str]:
92
+ ) -> Any:
66
93
  chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
67
94
  return chunker.chunks(content)
68
95
 
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.4.2
3
+ Version: 3.6.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
36
36
  Provides-Extra: all
37
37
  Requires-Dist: click>=8.2.1; extra == 'all'
38
38
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
39
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
39
40
  Requires-Dist: gmft>=0.4.2; extra == 'all'
40
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
41
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
42
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
41
43
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
42
44
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
43
45
  Requires-Dist: rich>=14.0.0; extra == 'all'
44
46
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
45
47
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
48
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
46
49
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
47
50
  Provides-Extra: api
48
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
51
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
49
52
  Provides-Extra: chunking
50
53
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
51
54
  Provides-Extra: cli
@@ -54,8 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
54
57
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
55
58
  Provides-Extra: easyocr
56
59
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
60
+ Provides-Extra: entity-extraction
61
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
62
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
57
63
  Provides-Extra: gmft
58
64
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
65
+ Provides-Extra: langdetect
66
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
59
67
  Provides-Extra: paddleocr
60
68
  Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
61
69
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
@@ -1,50 +1,54 @@
1
- kreuzberg/__init__.py,sha256=5GP2j8PI3P_ZNSEhLpm8iqseY3i4nye6iUmVGUnfzno,1311
1
+ kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
2
2
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
3
  kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
4
  kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
6
+ kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
7
+ kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
8
+ kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
7
9
  kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
10
  kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
11
  kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
- kreuzberg/_types.py,sha256=8kwDjQjBdiTbNcRwJmH4vijNpf9Ml9WNW85Uxv2alDw,7634
12
+ kreuzberg/_types.py,sha256=U72a4SXS1e-zV8cXG0tiozMy9mX9wFM1ma6sVz7HpJo,9936
11
13
  kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
14
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
- kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
15
+ kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
14
16
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
17
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
18
  kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
17
19
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
20
  kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
19
21
  kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
20
- kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
22
+ kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
21
23
  kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
22
- kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
24
+ kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
23
25
  kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
24
- kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
26
+ kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
25
27
  kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
26
- kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
27
- kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
28
+ kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
29
+ kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
30
+ kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
31
+ kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
28
32
  kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
29
33
  kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
30
34
  kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
31
35
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
32
- kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
36
+ kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
33
37
  kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
34
- kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
38
+ kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
35
39
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
40
  kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
37
- kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
41
+ kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
38
42
  kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
39
43
  kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
40
44
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
41
- kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
45
+ kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
42
46
  kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
43
47
  kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
44
- kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
45
- kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
46
- kreuzberg-3.4.2.dist-info/METADATA,sha256=OW-2YLyJIE3TKPZ1R-7T8NuS5aaTPRFaYP4-h7D3efw,8702
47
- kreuzberg-3.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
48
- kreuzberg-3.4.2.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
49
- kreuzberg-3.4.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
50
- kreuzberg-3.4.2.dist-info/RECORD,,
48
+ kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
49
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
+ kreuzberg-3.6.0.dist-info/METADATA,sha256=zlqw5yTQit-jYeZVnM27kPsn2mCfulpL8wssptrQR8Q,9160
51
+ kreuzberg-3.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.6.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
53
+ kreuzberg-3.6.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.6.0.dist-info/RECORD,,