airbyte-cdk 6.13.1.dev4106__py3-none-any.whl → 6.13.1.dev4108__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +18 -2
- {airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/RECORD +6 -6
- {airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/entry_points.txt +0 -0
@@ -42,16 +42,32 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
42
42
|
unstructured_partition_pdf = None
|
43
43
|
unstructured_partition_docx = None
|
44
44
|
unstructured_partition_pptx = None
|
45
|
-
|
45
|
+
|
46
|
+
|
47
|
+
def get_ntlk_temp_folder() -> str:
|
48
|
+
"""
|
49
|
+
For non-root connectors /tmp is not currently writable, but we should allow it in the future.
|
50
|
+
It's safe to use /airbyte for now. Fallback to /tmp for local development.
|
51
|
+
"""
|
52
|
+
try:
|
53
|
+
nltk_data_dir = "/airbyte/nltk_data"
|
54
|
+
os.makedirs(nltk_data_dir, exist_ok=True)
|
55
|
+
except OSError:
|
56
|
+
nltk_data_dir = "/tmp/nltk_data"
|
57
|
+
os.makedirs(nltk_data_dir, exist_ok=True)
|
58
|
+
return nltk_data_dir
|
59
|
+
|
46
60
|
|
47
61
|
try:
|
48
|
-
|
62
|
+
nltk_data_dir = get_ntlk_temp_folder()
|
49
63
|
nltk.data.path.append(nltk_data_dir)
|
50
64
|
nltk.data.find("tokenizers/punkt.zip")
|
51
65
|
nltk.data.find("tokenizers/punkt_tab.zip")
|
66
|
+
nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
|
52
67
|
except LookupError:
|
53
68
|
nltk.download("punkt", download_dir=nltk_data_dir)
|
54
69
|
nltk.download("punkt_tab", download_dir=nltk_data_dir)
|
70
|
+
nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir)
|
55
71
|
|
56
72
|
|
57
73
|
def optional_decode(contents: Union[str, bytes]) -> str:
|
@@ -217,7 +217,7 @@ airbyte_cdk/sources/file_based/file_types/file_transfer.py,sha256=HyGRihJxcb_lEs
|
|
217
217
|
airbyte_cdk/sources/file_based/file_types/file_type_parser.py,sha256=JgpH21PrbRqwK92BJklZWvh2TndA6xZ-eP1LPMo44oQ,2832
|
218
218
|
airbyte_cdk/sources/file_based/file_types/jsonl_parser.py,sha256=GwyNyxmST4RX-XpXy7xVH0D-znYWWBmGv_pVAu95oHQ,5886
|
219
219
|
airbyte_cdk/sources/file_based/file_types/parquet_parser.py,sha256=XenFg5sJ-UBnIkSmsiNJRou11NO0zZXx-RXgPHMT2NA,10487
|
220
|
-
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=
|
220
|
+
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py,sha256=FOZL-RiXc7sndfdYZtLVSR93c_eHlKaS_nv2KrqFu2E,19371
|
221
221
|
airbyte_cdk/sources/file_based/remote_file.py,sha256=yqRz93vPe8PBXLIMJ5W5u2JRlZRhg6sBrAjn3pPjJ8A,315
|
222
222
|
airbyte_cdk/sources/file_based/schema_helpers.py,sha256=Cf8FH1bDFP0qCDDfEYir_WjP4exXUnikz8hZ40y1Ek0,9601
|
223
223
|
airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py,sha256=FkByIyEy56x2_awYnxGPqGaOp7zAzpAoRkPZHKySI9M,536
|
@@ -342,8 +342,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
|
|
342
342
|
airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
|
343
343
|
airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
|
344
344
|
airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
|
345
|
-
airbyte_cdk-6.13.1.
|
346
|
-
airbyte_cdk-6.13.1.
|
347
|
-
airbyte_cdk-6.13.1.
|
348
|
-
airbyte_cdk-6.13.1.
|
349
|
-
airbyte_cdk-6.13.1.
|
345
|
+
airbyte_cdk-6.13.1.dev4108.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
346
|
+
airbyte_cdk-6.13.1.dev4108.dist-info/METADATA,sha256=4MmYepPLObB4KNT5D7ttKKhwRdVaXdJnZMcOLewYXOU,6008
|
347
|
+
airbyte_cdk-6.13.1.dev4108.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
348
|
+
airbyte_cdk-6.13.1.dev4108.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
|
349
|
+
airbyte_cdk-6.13.1.dev4108.dist-info/RECORD,,
|
File without changes
|
File without changes
|
{airbyte_cdk-6.13.1.dev4106.dist-info → airbyte_cdk-6.13.1.dev4108.dist-info}/entry_points.txt
RENAMED
File without changes
|