unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +34 -0
- test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
- test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
- test/integration/connectors/sql/test_postgres.py +10 -4
- test/integration/connectors/sql/test_singlestore.py +8 -4
- test/integration/connectors/sql/test_snowflake.py +10 -6
- test/integration/connectors/sql/test_sqlite.py +4 -4
- test/integration/connectors/test_astradb.py +50 -3
- test/integration/connectors/test_delta_table.py +46 -0
- test/integration/connectors/test_kafka.py +40 -6
- test/integration/connectors/test_lancedb.py +210 -0
- test/integration/connectors/test_milvus.py +141 -0
- test/integration/connectors/test_mongodb.py +332 -0
- test/integration/connectors/test_pinecone.py +53 -1
- test/integration/connectors/utils/docker.py +81 -15
- test/integration/connectors/utils/validation.py +10 -0
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +15 -0
- test/integration/connectors/weaviate/test_local.py +131 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/utils/data_prep.py +9 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
- unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
- unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
- unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
- unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
- unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
- unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
- unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
- unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
- unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
- unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
- unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
- unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
- unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
- unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
- /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,37 +22,37 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: tqdm
|
|
26
|
-
Requires-Dist: pandas
|
|
27
|
-
Requires-Dist: dataclasses-json
|
|
28
|
-
Requires-Dist: opentelemetry-sdk
|
|
29
25
|
Requires-Dist: python-dateutil
|
|
30
26
|
Requires-Dist: pydantic>=2.7
|
|
27
|
+
Requires-Dist: opentelemetry-sdk
|
|
31
28
|
Requires-Dist: click
|
|
29
|
+
Requires-Dist: tqdm
|
|
30
|
+
Requires-Dist: pandas
|
|
31
|
+
Requires-Dist: dataclasses-json
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
34
|
Provides-Extra: astradb
|
|
35
35
|
Requires-Dist: astrapy; extra == "astradb"
|
|
36
36
|
Provides-Extra: azure
|
|
37
|
-
Requires-Dist: adlfs; extra == "azure"
|
|
38
37
|
Requires-Dist: fsspec; extra == "azure"
|
|
38
|
+
Requires-Dist: adlfs; extra == "azure"
|
|
39
39
|
Provides-Extra: azure-ai-search
|
|
40
40
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
41
|
Provides-Extra: bedrock
|
|
42
42
|
Requires-Dist: boto3; extra == "bedrock"
|
|
43
43
|
Provides-Extra: biomed
|
|
44
|
-
Requires-Dist: requests; extra == "biomed"
|
|
45
44
|
Requires-Dist: bs4; extra == "biomed"
|
|
45
|
+
Requires-Dist: requests; extra == "biomed"
|
|
46
46
|
Provides-Extra: box
|
|
47
|
-
Requires-Dist: fsspec; extra == "box"
|
|
48
47
|
Requires-Dist: boxfs; extra == "box"
|
|
48
|
+
Requires-Dist: fsspec; extra == "box"
|
|
49
49
|
Provides-Extra: chroma
|
|
50
50
|
Requires-Dist: chromadb; extra == "chroma"
|
|
51
51
|
Provides-Extra: clarifai
|
|
52
52
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
53
53
|
Provides-Extra: confluence
|
|
54
|
-
Requires-Dist: requests; extra == "confluence"
|
|
55
54
|
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
55
|
+
Requires-Dist: requests; extra == "confluence"
|
|
56
56
|
Provides-Extra: couchbase
|
|
57
57
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
58
58
|
Provides-Extra: csv
|
|
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
|
|
|
69
69
|
Provides-Extra: docx
|
|
70
70
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
71
71
|
Provides-Extra: dropbox
|
|
72
|
-
Requires-Dist: fsspec; extra == "dropbox"
|
|
73
72
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
73
|
+
Requires-Dist: fsspec; extra == "dropbox"
|
|
74
74
|
Provides-Extra: elasticsearch
|
|
75
75
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
76
76
|
Provides-Extra: embed-huggingface
|
|
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
|
78
78
|
Provides-Extra: embed-mixedbreadai
|
|
79
79
|
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
80
80
|
Provides-Extra: embed-octoai
|
|
81
|
-
Requires-Dist: openai; extra == "embed-octoai"
|
|
82
81
|
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
82
|
+
Requires-Dist: openai; extra == "embed-octoai"
|
|
83
83
|
Provides-Extra: embed-vertexai
|
|
84
84
|
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
85
85
|
Provides-Extra: embed-voyageai
|
|
@@ -87,19 +87,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
87
87
|
Provides-Extra: epub
|
|
88
88
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
89
89
|
Provides-Extra: gcs
|
|
90
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
91
90
|
Requires-Dist: bs4; extra == "gcs"
|
|
92
91
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
92
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
93
93
|
Provides-Extra: github
|
|
94
|
-
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
95
94
|
Requires-Dist: requests; extra == "github"
|
|
95
|
+
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
96
96
|
Provides-Extra: gitlab
|
|
97
97
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
98
98
|
Provides-Extra: google-drive
|
|
99
99
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
100
100
|
Provides-Extra: hubspot
|
|
101
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
102
101
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
102
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
103
103
|
Provides-Extra: jira
|
|
104
104
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
105
105
|
Provides-Extra: kafka
|
|
@@ -115,19 +115,19 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
115
115
|
Provides-Extra: msg
|
|
116
116
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
117
117
|
Provides-Extra: notion
|
|
118
|
-
Requires-Dist: httpx; extra == "notion"
|
|
119
118
|
Requires-Dist: backoff; extra == "notion"
|
|
120
119
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
121
120
|
Requires-Dist: notion-client; extra == "notion"
|
|
121
|
+
Requires-Dist: httpx; extra == "notion"
|
|
122
122
|
Provides-Extra: odt
|
|
123
123
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
124
124
|
Provides-Extra: onedrive
|
|
125
|
+
Requires-Dist: bs4; extra == "onedrive"
|
|
125
126
|
Requires-Dist: msal; extra == "onedrive"
|
|
126
127
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
127
|
-
Requires-Dist: bs4; extra == "onedrive"
|
|
128
128
|
Provides-Extra: openai
|
|
129
|
-
Requires-Dist: openai; extra == "openai"
|
|
130
129
|
Requires-Dist: tiktoken; extra == "openai"
|
|
130
|
+
Requires-Dist: openai; extra == "openai"
|
|
131
131
|
Provides-Extra: opensearch
|
|
132
132
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
133
133
|
Provides-Extra: org
|
|
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
|
|
|
156
156
|
Provides-Extra: rtf
|
|
157
157
|
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
158
158
|
Provides-Extra: s3
|
|
159
|
-
Requires-Dist: s3fs; extra == "s3"
|
|
160
159
|
Requires-Dist: fsspec; extra == "s3"
|
|
160
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
161
161
|
Provides-Extra: salesforce
|
|
162
162
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
163
163
|
Provides-Extra: sftp
|
|
@@ -171,8 +171,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
|
|
|
171
171
|
Provides-Extra: slack
|
|
172
172
|
Requires-Dist: slack-sdk[optional]; extra == "slack"
|
|
173
173
|
Provides-Extra: snowflake
|
|
174
|
-
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
175
174
|
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
175
|
+
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
176
176
|
Provides-Extra: togetherai
|
|
177
177
|
Requires-Dist: together; extra == "togetherai"
|
|
178
178
|
Provides-Extra: tsv
|
|
@@ -5,27 +5,37 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
5
5
|
test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
|
|
6
6
|
test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
|
|
8
|
-
test/integration/connectors/test_astradb.py,sha256=
|
|
9
|
-
test/integration/connectors/
|
|
8
|
+
test/integration/connectors/test_astradb.py,sha256=QPFrODXmOHagpuKaiooxXb3OEW93w2g4fmq8BkaBCnY,5303
|
|
9
|
+
test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
|
|
10
10
|
test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
|
|
11
|
-
test/integration/connectors/test_delta_table.py,sha256=
|
|
12
|
-
test/integration/connectors/test_kafka.py,sha256=
|
|
11
|
+
test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
|
|
12
|
+
test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
|
|
13
|
+
test/integration/connectors/test_lancedb.py,sha256=O3YF6MVBkCsCgklXCJe8Kpy8aKGfafASVH4PspmpcYs,7628
|
|
14
|
+
test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
|
|
15
|
+
test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
|
|
13
16
|
test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
|
|
14
|
-
test/integration/connectors/test_pinecone.py,sha256=
|
|
17
|
+
test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
|
|
15
18
|
test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
|
|
16
19
|
test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
|
|
17
20
|
test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
21
|
test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
|
|
22
|
+
test/integration/connectors/elasticsearch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
test/integration/connectors/elasticsearch/conftest.py,sha256=-i4_7MkIxSQENz7nuD2uHuhGU9mZ33vpeTPhHtRpQfs,989
|
|
24
|
+
test/integration/connectors/elasticsearch/test_elasticsearch.py,sha256=nqdHwBpvgk_74orzDaQIKALK5cb0YloxSdt7QDJX0r0,11169
|
|
25
|
+
test/integration/connectors/elasticsearch/test_opensearch.py,sha256=Rk4tQ_Qv5icycDWMUpnzTbg-QzwGyb6nKqB0gDef9D0,10555
|
|
19
26
|
test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
-
test/integration/connectors/sql/test_postgres.py,sha256=
|
|
21
|
-
test/integration/connectors/sql/test_singlestore.py,sha256=
|
|
22
|
-
test/integration/connectors/sql/test_snowflake.py,sha256=
|
|
23
|
-
test/integration/connectors/sql/test_sqlite.py,sha256=
|
|
27
|
+
test/integration/connectors/sql/test_postgres.py,sha256=lrymDI7bVX_4qij5gsUc_bTvHPeelu6hpJemQ6WWmlY,6783
|
|
28
|
+
test/integration/connectors/sql/test_singlestore.py,sha256=iCp9q6tzhNIUCUubCPiRKj6VmJnwot4JGo9fkkTHg_U,5960
|
|
29
|
+
test/integration/connectors/sql/test_snowflake.py,sha256=DqQIV9H5Uv7HaHtDyrAPdqefd316oVt5lKtdJ2Zdk6Q,7082
|
|
30
|
+
test/integration/connectors/sql/test_sqlite.py,sha256=gSfp2hXAb5BGknzZXVa7K5bBwEb5Li4k5493mQCFjBQ,5719
|
|
24
31
|
test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
32
|
test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
|
|
26
|
-
test/integration/connectors/utils/docker.py,sha256=
|
|
33
|
+
test/integration/connectors/utils/docker.py,sha256=lnSjRgYoQa5c5nBdg2eLkB8KJVOjk4eyqq_C6PtTkME,4806
|
|
27
34
|
test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQQx_cBfz13ZTs1HpbaYipNU,2223
|
|
28
|
-
test/integration/connectors/utils/validation.py,sha256=
|
|
35
|
+
test/integration/connectors/utils/validation.py,sha256=SwvPVuHjJxTo8xEUwnuL9FZNpu3sZZ8iouOz5xh_kB8,14272
|
|
36
|
+
test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
|
|
38
|
+
test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
|
|
29
39
|
test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
40
|
test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
|
|
31
41
|
test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
|
|
@@ -71,7 +81,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
71
81
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
72
82
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
73
83
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
74
|
-
unstructured_ingest/__version__.py,sha256=
|
|
84
|
+
unstructured_ingest/__version__.py,sha256=Js7MXQhyIj1akVjPNsLkmZxqoOHDGOr2opEPgFOSTZQ,42
|
|
75
85
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
76
86
|
unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
|
|
77
87
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -267,7 +277,7 @@ unstructured_ingest/pipeline/utils.py,sha256=RNx4bv2FhKOhaK_YTiRubta7n9wmJwqzznF
|
|
|
267
277
|
unstructured_ingest/pipeline/write.py,sha256=xmDjmbieGRrcI342he7PkgxWaMoSJ5nWPmP5AM2xloU,669
|
|
268
278
|
unstructured_ingest/pipeline/reformat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
269
279
|
unstructured_ingest/pipeline/reformat/chunking.py,sha256=vbJgi2Yl9Rd9yZxIf64Nxj6cjUJnJWRpDCagswQmrLw,6040
|
|
270
|
-
unstructured_ingest/pipeline/reformat/embedding.py,sha256=
|
|
280
|
+
unstructured_ingest/pipeline/reformat/embedding.py,sha256=vyRgrNvz50eMOCO00YdV9ODK0LRIB3_NF6t1mWD01uc,2525
|
|
271
281
|
unstructured_ingest/runner/__init__.py,sha256=FO0X_jBIMilXdyjBajyFmzHoC3eVypNMGlhdOW4mcCM,2859
|
|
272
282
|
unstructured_ingest/runner/airtable.py,sha256=1ndJ6PKT63E0gZN3KYFBj4Yo94zQYsIvSjC6ro2nIPE,1115
|
|
273
283
|
unstructured_ingest/runner/astradb.py,sha256=FSBtQrsdC9E3eHUcAuQ0apcCnWolz-9tkvy-Uf7QeKg,1102
|
|
@@ -329,7 +339,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
|
|
|
329
339
|
unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
330
340
|
unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
|
|
331
341
|
unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
|
|
332
|
-
unstructured_ingest/utils/data_prep.py,sha256=
|
|
342
|
+
unstructured_ingest/utils/data_prep.py,sha256=IDAedOSBdgZpD9IY4tLJT-rmKGV7GHtU6KRj6VM-_tE,4666
|
|
333
343
|
unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
|
|
334
344
|
unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
|
|
335
345
|
unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
|
|
@@ -382,36 +392,36 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
|
|
|
382
392
|
unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
|
|
383
393
|
unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
|
|
384
394
|
unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
|
|
385
|
-
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=
|
|
395
|
+
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
|
|
386
396
|
unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
|
|
387
|
-
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=
|
|
388
|
-
unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256
|
|
397
|
+
unstructured_ingest/v2/processes/connectors/astradb.py,sha256=zsIElFNNqVCXcLqBw6C8bRoyPQDrGNPkTWeA0FYYO94,14703
|
|
398
|
+
unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
|
|
389
399
|
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
|
|
390
400
|
unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
|
|
391
401
|
unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
|
|
392
|
-
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=
|
|
393
|
-
unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=lNvUbbTMv2ZKxRN6cesfD2AeQc1kQG9AKqY9RHBfVXs,16796
|
|
402
|
+
unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
|
|
394
403
|
unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
|
|
395
|
-
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=
|
|
404
|
+
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
|
|
396
405
|
unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
|
|
397
406
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
|
|
398
|
-
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=
|
|
399
|
-
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=
|
|
407
|
+
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
|
|
408
|
+
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
|
|
400
409
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
|
|
401
|
-
unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
|
|
402
410
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
403
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
411
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
|
|
404
412
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
405
413
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
|
|
406
414
|
unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
|
|
407
415
|
unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
|
|
408
|
-
unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
|
|
409
416
|
unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
|
|
410
417
|
unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=p7sjCYZb7JmY3v3Xy1gm-q0O7oamLTsSFf2EWXYfXYQ,6447
|
|
411
418
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
|
|
412
419
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
|
|
413
420
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
|
|
414
421
|
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
|
|
422
|
+
unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
|
|
423
|
+
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
|
|
424
|
+
unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
|
|
415
425
|
unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
|
|
416
426
|
unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
|
|
417
427
|
unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
|
|
@@ -423,8 +433,14 @@ unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBW
|
|
|
423
433
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
424
434
|
unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
|
|
425
435
|
unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=qprsfI8VH0mVTa1MOCpa2D4coyopinQ5ag2KXcAecXE,3296
|
|
426
|
-
unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=
|
|
436
|
+
unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=qEv_yaG94KekFtfS06KgpTTbqeJkje0hn5uOjsMMngw,9414
|
|
427
437
|
unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=vwLZjvc_C17zOqcrzic0aIoPwS98sqYiwiMknw2IcK4,2586
|
|
438
|
+
unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=lHUPCOiyOGu1IME1QiyFBZaB8z8e3bP8Y8TkqKs32Qk,906
|
|
439
|
+
unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=yR8V4O-oI_nUKJtHTLxhteEJpPDPn-_d2IkkXvgThJ0,1406
|
|
440
|
+
unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
|
|
441
|
+
unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
|
|
442
|
+
unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
|
|
443
|
+
unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
|
|
428
444
|
unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
|
|
429
445
|
unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
|
|
430
446
|
unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
|
|
@@ -434,11 +450,16 @@ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=D43wrV2ADvQsT
|
|
|
434
450
|
unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=rHaSb1MtdWMY6eQL2i2cWSL4w0VApFTChzmWtyfvFTI,5140
|
|
435
451
|
unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1c5-2fnR3UAyj_4KfvjYTQ2cWzpvsdJOnU,5535
|
|
436
452
|
unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
|
|
437
|
-
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
|
|
453
|
+
unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
|
|
438
454
|
unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
|
|
439
|
-
unstructured_ingest
|
|
440
|
-
unstructured_ingest
|
|
441
|
-
unstructured_ingest
|
|
442
|
-
unstructured_ingest
|
|
443
|
-
unstructured_ingest
|
|
444
|
-
unstructured_ingest-0.3.
|
|
455
|
+
unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
|
|
456
|
+
unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
|
|
457
|
+
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
458
|
+
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
459
|
+
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
|
|
460
|
+
unstructured_ingest-0.3.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
461
|
+
unstructured_ingest-0.3.2.dist-info/METADATA,sha256=rqTWqewB8eIrgrHJ-8AsNtehy35eSHKseCsveXTwN3Y,7326
|
|
462
|
+
unstructured_ingest-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
463
|
+
unstructured_ingest-0.3.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
464
|
+
unstructured_ingest-0.3.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
465
|
+
unstructured_ingest-0.3.2.dist-info/RECORD,,
|
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
from dataclasses import dataclass, field
|
|
3
|
-
from datetime import date, datetime
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Optional
|
|
6
|
-
|
|
7
|
-
from dateutil import parser
|
|
8
|
-
from pydantic import Field, Secret
|
|
9
|
-
|
|
10
|
-
from unstructured_ingest.error import DestinationConnectionError
|
|
11
|
-
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
|
-
from unstructured_ingest.v2.interfaces import (
|
|
13
|
-
AccessConfig,
|
|
14
|
-
ConnectionConfig,
|
|
15
|
-
FileData,
|
|
16
|
-
Uploader,
|
|
17
|
-
UploaderConfig,
|
|
18
|
-
UploadStager,
|
|
19
|
-
UploadStagerConfig,
|
|
20
|
-
)
|
|
21
|
-
from unstructured_ingest.v2.logger import logger
|
|
22
|
-
from unstructured_ingest.v2.processes.connector_registry import (
|
|
23
|
-
DestinationRegistryEntry,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
if TYPE_CHECKING:
|
|
27
|
-
from weaviate import Client
|
|
28
|
-
|
|
29
|
-
CONNECTOR_TYPE = "weaviate"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class WeaviateAccessConfig(AccessConfig):
|
|
33
|
-
access_token: Optional[str] = Field(
|
|
34
|
-
default=None, description="Used to create the bearer token."
|
|
35
|
-
)
|
|
36
|
-
api_key: Optional[str] = None
|
|
37
|
-
client_secret: Optional[str] = None
|
|
38
|
-
password: Optional[str] = None
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class WeaviateConnectionConfig(ConnectionConfig):
|
|
42
|
-
host_url: str = Field(description="Weaviate instance url")
|
|
43
|
-
class_name: str = Field(
|
|
44
|
-
description="Name of the class to push the records into, e.g: Pdf-elements"
|
|
45
|
-
)
|
|
46
|
-
access_config: Secret[WeaviateAccessConfig] = Field(
|
|
47
|
-
default=WeaviateAccessConfig(), validate_default=True
|
|
48
|
-
)
|
|
49
|
-
username: Optional[str] = None
|
|
50
|
-
anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
|
|
51
|
-
scope: Optional[list[str]] = None
|
|
52
|
-
refresh_token: Optional[str] = Field(
|
|
53
|
-
default=None,
|
|
54
|
-
description="Will tie this value to the bearer token. If not provided, "
|
|
55
|
-
"the authentication will expire once the lifetime of the access token is up.",
|
|
56
|
-
)
|
|
57
|
-
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class WeaviateUploadStagerConfig(UploadStagerConfig):
|
|
61
|
-
pass
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrite partitioned elements so they conform to the Weaviate schema.

    Nested structures are serialised to JSON strings, date fields are
    normalised to ``%Y-%m-%dT%H:%M:%S.%fZ`` and a few scalar fields are cast
    to ``str``. The conformed elements are written to a new JSON file.
    """

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    # Deliberately un-annotated so the dataclass machinery does not turn
    # these constants into instance fields.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
    _DATA_SOURCE_DATE_KEYS = ("date_created", "date_modified", "date_processed")

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as an epoch timestamp or as a date string.

        Args:
            date_string: A string holding a POSIX timestamp (e.g.
                ``"1700000000.0"``) or any format ``parser.parse`` accepts.

        Returns:
            A ``datetime``. Timestamps are returned as naive datetimes
            expressed in UTC, because callers format the result with a
            literal ``Z`` suffix.

        Raises:
            Whatever ``parser.parse`` raises when the value is neither a
            timestamp nor a parseable date string.
        """
        # Imported locally: only `date` and `datetime` are known to be
        # imported at module level.
        from datetime import timezone

        try:
            timestamp = float(date_string)
            # Convert in UTC rather than local time, then drop tzinfo so the
            # return type stays a naive datetime as before.
            return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @classmethod
    def _format_date(cls, value: str) -> str:
        """Return ``value`` re-formatted in the date layout used for Weaviate."""
        return cls.parse_date_string(value).strftime(cls._DATE_FORMAT)

    @classmethod
    def conform_dict(cls, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The dictionary is modified in place. Sections that are missing, or
        present with a ``None`` value, are skipped.
        """
        # `or {}` also covers keys that exist but hold None, which a plain
        # `.get(key, {})` chain would trip over with an AttributeError.
        metadata = data.get("metadata") or {}
        data_source = metadata.get("data_source") or {}
        coordinates = metadata.get("coordinates") or {}

        # Dict as string formatting
        if record_locator := data_source.get("record_locator"):
            # Stored as a JSON string, otherwise fails schema type checking
            data_source["record_locator"] = json.dumps(record_locator)

        # Array of items as string formatting
        if points := coordinates.get("points"):
            coordinates["points"] = json.dumps(points)

        if links := metadata.get("links"):
            metadata["links"] = json.dumps(links)

        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)

        # Datetime formatting
        for date_key in cls._DATA_SOURCE_DATE_KEYS:
            if date_value := data_source.get(date_key):
                data_source[date_key] = cls._format_date(date_value)

        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = cls._format_date(last_modified)

        # String casting. Compared against None so a legitimate 0 is still
        # cast instead of being left behind as an int.
        version = data_source.get("version")
        if version is not None:
            data_source["version"] = str(version)

        page_number = metadata.get("page_number")
        if page_number is not None:
            metadata["page_number"] = str(page_number)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result.

        Args:
            elements_filepath: JSON file containing a list of element dicts.
            file_data: Not used by this stager.
            output_dir: Directory the conformed file is written to.
            output_filename: Output file name; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            Path of the written ``<output_dir>/<output_filename>.json`` file.
        """
        with open(elements_filepath) as elements_file:
            elements_contents = json.load(elements_file)
        for element in elements_contents:
            self.conform_dict(data=element)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w") as output_file:
            json.dump(elements_contents, output_file)
        return output_path
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
class WeaviateUploaderConfig(UploaderConfig):
    # Forwarded to `client.batch.configure(batch_size=...)` in WeaviateUploader.run.
    batch_size: int = Field(default=100, description="Number of records per batch")
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
@dataclass
class WeaviateUploader(Uploader):
    """Upload staged element JSON to a Weaviate class through the client's batch API."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> "Client":
        """Create a Weaviate client for the configured host and auth method."""
        from weaviate import Client

        auth = self._resolve_auth_method()
        return Client(url=self.connection_config.host_url, auth_client_secret=auth)

    def precheck(self) -> None:
        """Validate the connection settings by constructing a client.

        Raises:
            DestinationConnectionError: If the client cannot be created. The
                original exception is attached as the cause.
        """
        try:
            self.get_client()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain explicitly so the underlying failure is kept as __cause__.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["weaviate"], extras="weaviate")
    def _resolve_auth_method(self):
        """Pick the Weaviate auth object matching the populated credentials.

        Precedence: anonymous (no auth), bearer token, API key, client
        credentials, then username/password.

        Returns:
            A ``weaviate.auth`` credentials object, or ``None`` when
            ``anonymous`` is set or no usable credentials are configured.
        """
        access_configs = self.connection_config.access_config.get_secret_value()
        connection_config = self.connection_config
        if connection_config.anonymous:
            return None

        # weaviate.auth helpers are imported lazily, only in the branch that
        # needs them, mirroring the optional-dependency handling above.
        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=connection_config.refresh_token,
            )
        if access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        if access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=connection_config.scope
            )
        if connection_config.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=connection_config.username,
                password=access_configs.password,
                scope=connection_config.scope,
            )
        return None

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load elements from ``path`` and add them to the configured class.

        Args:
            path: JSON file containing a list of element dicts.
            file_data: Not used by this uploader.
            **kwargs: Ignored.
        """
        with path.open("r") as file:
            elements_dict = json.load(file)
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connection_config.class_name} "
            f"at {self.connection_config.host_url}",
        )

        client = self.get_client()
        client.batch.configure(batch_size=self.upload_config.batch_size)
        with client.batch as batch:
            for element in elements_dict:
                # The embedding is removed from the object and passed
                # separately as its vector.
                vector = element.pop("embeddings", None)
                batch.add_data_object(
                    element,
                    self.connection_config.class_name,
                    vector=vector,
                )
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
# Registry entry bundling the Weaviate connection config, uploader and upload
# stager classes (with their config classes) into one destination definition.
weaviate_destination_entry = DestinationRegistryEntry(
    connection_config=WeaviateConnectionConfig,
    uploader=WeaviateUploader,
    uploader_config=WeaviateUploaderConfig,
    upload_stager=WeaviateUploadStager,
    upload_stager_config=WeaviateUploadStagerConfig,
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|