kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -1
- kreuzberg/_entity_extraction.py +239 -0
- kreuzberg/_extractors/_image.py +21 -1
- kreuzberg/_extractors/_pdf.py +44 -14
- kreuzberg/_extractors/_spread_sheet.py +2 -2
- kreuzberg/_gmft.py +4 -4
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +7 -3
- kreuzberg/_types.py +46 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_process_pool.py +2 -2
- kreuzberg/_utils/_sync.py +1 -5
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/extraction.py +39 -12
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/METADATA +12 -4
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/RECORD +24 -20
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/extraction.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import TYPE_CHECKING, Final, cast
|
4
|
+
from typing import TYPE_CHECKING, Any, Final, cast
|
5
5
|
|
6
6
|
import anyio
|
7
7
|
|
8
8
|
from kreuzberg import ExtractionResult
|
9
9
|
from kreuzberg._chunker import get_chunker
|
10
|
+
from kreuzberg._entity_extraction import extract_entities, extract_keywords
|
11
|
+
from kreuzberg._language_detection import detect_languages
|
10
12
|
from kreuzberg._mime_types import (
|
11
13
|
validate_mime_type,
|
12
14
|
)
|
@@ -24,10 +26,7 @@ if TYPE_CHECKING:
|
|
24
26
|
DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
|
25
27
|
|
26
28
|
|
27
|
-
|
28
|
-
for validator in config.validators or []:
|
29
|
-
await run_maybe_sync(validator, result)
|
30
|
-
|
29
|
+
def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
31
30
|
if config.chunk_content:
|
32
31
|
result.chunks = _handle_chunk_content(
|
33
32
|
mime_type=result.mime_type,
|
@@ -35,6 +34,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
|
|
35
34
|
content=result.content,
|
36
35
|
)
|
37
36
|
|
37
|
+
if config.extract_entities:
|
38
|
+
try:
|
39
|
+
result.entities = extract_entities(
|
40
|
+
result.content,
|
41
|
+
custom_patterns=config.custom_entity_patterns,
|
42
|
+
)
|
43
|
+
except RuntimeError:
|
44
|
+
result.entities = None
|
45
|
+
|
46
|
+
if config.extract_keywords:
|
47
|
+
try:
|
48
|
+
result.keywords = extract_keywords(
|
49
|
+
result.content,
|
50
|
+
keyword_count=config.keyword_count,
|
51
|
+
)
|
52
|
+
except RuntimeError:
|
53
|
+
result.keywords = None
|
54
|
+
|
55
|
+
if config.auto_detect_language:
|
56
|
+
result.detected_languages = detect_languages(
|
57
|
+
result.content,
|
58
|
+
config=config.language_detection_config,
|
59
|
+
)
|
60
|
+
|
61
|
+
return result
|
62
|
+
|
63
|
+
|
64
|
+
async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
65
|
+
for validator in config.validators or []:
|
66
|
+
await run_maybe_sync(validator, result)
|
67
|
+
|
68
|
+
result = _validate_and_post_process_helper(result, config)
|
69
|
+
|
38
70
|
for post_processor in config.post_processing_hooks or []:
|
39
71
|
result = await run_maybe_sync(post_processor, result)
|
40
72
|
|
@@ -45,12 +77,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
|
|
45
77
|
for validator in config.validators or []:
|
46
78
|
run_sync_only(validator, result)
|
47
79
|
|
48
|
-
|
49
|
-
result.chunks = _handle_chunk_content(
|
50
|
-
mime_type=result.mime_type,
|
51
|
-
config=config,
|
52
|
-
content=result.content,
|
53
|
-
)
|
80
|
+
result = _validate_and_post_process_helper(result, config)
|
54
81
|
|
55
82
|
for post_processor in config.post_processing_hooks or []:
|
56
83
|
result = run_sync_only(post_processor, result)
|
@@ -62,7 +89,7 @@ def _handle_chunk_content(
|
|
62
89
|
mime_type: str,
|
63
90
|
config: ExtractionConfig,
|
64
91
|
content: str,
|
65
|
-
) ->
|
92
|
+
) -> Any:
|
66
93
|
chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
|
67
94
|
return chunker.chunks(content)
|
68
95
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.4.2
|
3
|
+
Version: 3.6.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
|
+
Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
38
38
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
39
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
39
40
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
40
|
-
Requires-Dist:
|
41
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
42
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
41
43
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
42
44
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
43
45
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
44
46
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
45
47
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
48
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
46
49
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
47
50
|
Provides-Extra: api
|
48
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
51
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
49
52
|
Provides-Extra: chunking
|
50
53
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
51
54
|
Provides-Extra: cli
|
@@ -54,8 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
|
|
54
57
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
55
58
|
Provides-Extra: easyocr
|
56
59
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
60
|
+
Provides-Extra: entity-extraction
|
61
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
62
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
57
63
|
Provides-Extra: gmft
|
58
64
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
65
|
+
Provides-Extra: langdetect
|
66
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
59
67
|
Provides-Extra: paddleocr
|
60
68
|
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
61
69
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
@@ -1,50 +1,54 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=
|
1
|
+
kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
|
2
2
|
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
3
|
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
4
|
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
-
kreuzberg/
|
6
|
+
kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
|
7
|
+
kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
|
8
|
+
kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
|
7
9
|
kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
|
8
10
|
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
9
11
|
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
10
|
-
kreuzberg/_types.py,sha256=
|
12
|
+
kreuzberg/_types.py,sha256=U72a4SXS1e-zV8cXG0tiozMy9mX9wFM1ma6sVz7HpJo,9936
|
11
13
|
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
12
14
|
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
13
|
-
kreuzberg/extraction.py,sha256=
|
15
|
+
kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
|
14
16
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
17
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
18
|
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
17
19
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
20
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
19
21
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
20
|
-
kreuzberg/_extractors/_image.py,sha256=
|
22
|
+
kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
|
21
23
|
kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
|
22
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
24
|
+
kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
|
23
25
|
kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
|
24
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
26
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
|
25
27
|
kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
|
26
|
-
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=
|
27
|
-
kreuzberg/_multiprocessing/process_manager.py,sha256=
|
28
|
+
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
|
29
|
+
kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
|
30
|
+
kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
|
31
|
+
kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
|
28
32
|
kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
|
29
33
|
kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
|
30
34
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
31
35
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
32
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
36
|
+
kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
|
33
37
|
kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
|
34
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
38
|
+
kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
|
35
39
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
40
|
kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
|
37
|
-
kreuzberg/_utils/_device.py,sha256=
|
41
|
+
kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
|
38
42
|
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
39
43
|
kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
|
40
44
|
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
41
|
-
kreuzberg/_utils/_process_pool.py,sha256
|
45
|
+
kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
|
42
46
|
kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
|
43
47
|
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
44
|
-
kreuzberg/_utils/_sync.py,sha256=
|
45
|
-
kreuzberg/_utils/_tmp.py,sha256=
|
46
|
-
kreuzberg-3.
|
47
|
-
kreuzberg-3.
|
48
|
-
kreuzberg-3.
|
49
|
-
kreuzberg-3.
|
50
|
-
kreuzberg-3.
|
48
|
+
kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
|
49
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
50
|
+
kreuzberg-3.6.0.dist-info/METADATA,sha256=zlqw5yTQit-jYeZVnM27kPsn2mCfulpL8wssptrQR8Q,9160
|
51
|
+
kreuzberg-3.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
52
|
+
kreuzberg-3.6.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
53
|
+
kreuzberg-3.6.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
54
|
+
kreuzberg-3.6.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|