unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (55) hide show
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +210 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_mongodb.py +332 -0
  15. test/integration/connectors/test_pinecone.py +53 -1
  16. test/integration/connectors/utils/docker.py +81 -15
  17. test/integration/connectors/utils/validation.py +10 -0
  18. test/integration/connectors/weaviate/__init__.py +0 -0
  19. test/integration/connectors/weaviate/conftest.py +15 -0
  20. test/integration/connectors/weaviate/test_local.py +131 -0
  21. unstructured_ingest/__version__.py +1 -1
  22. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  23. unstructured_ingest/utils/data_prep.py +9 -1
  24. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  25. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  26. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  27. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  28. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  29. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  30. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  31. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  32. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  38. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  39. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  40. unstructured_ingest/v2/processes/connectors/mongodb.py +122 -111
  41. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  42. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  43. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +25 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  45. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  46. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  47. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +299 -0
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/METADATA +19 -19
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/RECORD +54 -33
  50. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  51. /test/integration/connectors/{test_azure_cog_search.py → test_azure_ai_search.py} +0 -0
  52. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/LICENSE.md +0 -0
  53. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/WHEEL +0 -0
  54. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/entry_points.txt +0 -0
  55. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,37 +22,37 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: tqdm
26
- Requires-Dist: pandas
27
- Requires-Dist: dataclasses-json
28
- Requires-Dist: opentelemetry-sdk
29
25
  Requires-Dist: python-dateutil
30
26
  Requires-Dist: pydantic>=2.7
27
+ Requires-Dist: opentelemetry-sdk
31
28
  Requires-Dist: click
29
+ Requires-Dist: tqdm
30
+ Requires-Dist: pandas
31
+ Requires-Dist: dataclasses-json
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: adlfs; extra == "azure"
38
37
  Requires-Dist: fsspec; extra == "azure"
38
+ Requires-Dist: adlfs; extra == "azure"
39
39
  Provides-Extra: azure-ai-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
41
41
  Provides-Extra: bedrock
42
42
  Requires-Dist: boto3; extra == "bedrock"
43
43
  Provides-Extra: biomed
44
- Requires-Dist: requests; extra == "biomed"
45
44
  Requires-Dist: bs4; extra == "biomed"
45
+ Requires-Dist: requests; extra == "biomed"
46
46
  Provides-Extra: box
47
- Requires-Dist: fsspec; extra == "box"
48
47
  Requires-Dist: boxfs; extra == "box"
48
+ Requires-Dist: fsspec; extra == "box"
49
49
  Provides-Extra: chroma
50
50
  Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
52
52
  Requires-Dist: clarifai; extra == "clarifai"
53
53
  Provides-Extra: confluence
54
- Requires-Dist: requests; extra == "confluence"
55
54
  Requires-Dist: atlassian-python-api; extra == "confluence"
55
+ Requires-Dist: requests; extra == "confluence"
56
56
  Provides-Extra: couchbase
57
57
  Requires-Dist: couchbase; extra == "couchbase"
58
58
  Provides-Extra: csv
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
69
69
  Provides-Extra: docx
70
70
  Requires-Dist: unstructured[docx]; extra == "docx"
71
71
  Provides-Extra: dropbox
72
- Requires-Dist: fsspec; extra == "dropbox"
73
72
  Requires-Dist: dropboxdrivefs; extra == "dropbox"
73
+ Requires-Dist: fsspec; extra == "dropbox"
74
74
  Provides-Extra: elasticsearch
75
75
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
76
76
  Provides-Extra: embed-huggingface
@@ -78,8 +78,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
78
78
  Provides-Extra: embed-mixedbreadai
79
79
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
80
80
  Provides-Extra: embed-octoai
81
- Requires-Dist: openai; extra == "embed-octoai"
82
81
  Requires-Dist: tiktoken; extra == "embed-octoai"
82
+ Requires-Dist: openai; extra == "embed-octoai"
83
83
  Provides-Extra: embed-vertexai
84
84
  Requires-Dist: vertexai; extra == "embed-vertexai"
85
85
  Provides-Extra: embed-voyageai
@@ -87,19 +87,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
87
87
  Provides-Extra: epub
88
88
  Requires-Dist: unstructured[epub]; extra == "epub"
89
89
  Provides-Extra: gcs
90
- Requires-Dist: fsspec; extra == "gcs"
91
90
  Requires-Dist: bs4; extra == "gcs"
92
91
  Requires-Dist: gcsfs; extra == "gcs"
92
+ Requires-Dist: fsspec; extra == "gcs"
93
93
  Provides-Extra: github
94
- Requires-Dist: pygithub>1.58.0; extra == "github"
95
94
  Requires-Dist: requests; extra == "github"
95
+ Requires-Dist: pygithub>1.58.0; extra == "github"
96
96
  Provides-Extra: gitlab
97
97
  Requires-Dist: python-gitlab; extra == "gitlab"
98
98
  Provides-Extra: google-drive
99
99
  Requires-Dist: google-api-python-client; extra == "google-drive"
100
100
  Provides-Extra: hubspot
101
- Requires-Dist: hubspot-api-client; extra == "hubspot"
102
101
  Requires-Dist: urllib3; extra == "hubspot"
102
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
103
103
  Provides-Extra: jira
104
104
  Requires-Dist: atlassian-python-api; extra == "jira"
105
105
  Provides-Extra: kafka
@@ -115,19 +115,19 @@ Requires-Dist: pymongo; extra == "mongodb"
115
115
  Provides-Extra: msg
116
116
  Requires-Dist: unstructured[msg]; extra == "msg"
117
117
  Provides-Extra: notion
118
- Requires-Dist: httpx; extra == "notion"
119
118
  Requires-Dist: backoff; extra == "notion"
120
119
  Requires-Dist: htmlBuilder; extra == "notion"
121
120
  Requires-Dist: notion-client; extra == "notion"
121
+ Requires-Dist: httpx; extra == "notion"
122
122
  Provides-Extra: odt
123
123
  Requires-Dist: unstructured[odt]; extra == "odt"
124
124
  Provides-Extra: onedrive
125
+ Requires-Dist: bs4; extra == "onedrive"
125
126
  Requires-Dist: msal; extra == "onedrive"
126
127
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
127
- Requires-Dist: bs4; extra == "onedrive"
128
128
  Provides-Extra: openai
129
- Requires-Dist: openai; extra == "openai"
130
129
  Requires-Dist: tiktoken; extra == "openai"
130
+ Requires-Dist: openai; extra == "openai"
131
131
  Provides-Extra: opensearch
132
132
  Requires-Dist: opensearch-py; extra == "opensearch"
133
133
  Provides-Extra: org
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
156
156
  Provides-Extra: rtf
157
157
  Requires-Dist: unstructured[rtf]; extra == "rtf"
158
158
  Provides-Extra: s3
159
- Requires-Dist: s3fs; extra == "s3"
160
159
  Requires-Dist: fsspec; extra == "s3"
160
+ Requires-Dist: s3fs; extra == "s3"
161
161
  Provides-Extra: salesforce
162
162
  Requires-Dist: simple-salesforce; extra == "salesforce"
163
163
  Provides-Extra: sftp
@@ -171,8 +171,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
171
171
  Provides-Extra: slack
172
172
  Requires-Dist: slack-sdk[optional]; extra == "slack"
173
173
  Provides-Extra: snowflake
174
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
175
174
  Requires-Dist: psycopg2-binary; extra == "snowflake"
175
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
176
176
  Provides-Extra: togetherai
177
177
  Requires-Dist: together; extra == "togetherai"
178
178
  Provides-Extra: tsv
@@ -5,27 +5,37 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
5
5
  test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
6
6
  test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
8
- test/integration/connectors/test_astradb.py,sha256=Zy0xVOV60HOsvGKM4ankBE_X5ST37PBzR3iusk7DsEc,3492
9
- test/integration/connectors/test_azure_cog_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
8
+ test/integration/connectors/test_astradb.py,sha256=QPFrODXmOHagpuKaiooxXb3OEW93w2g4fmq8BkaBCnY,5303
9
+ test/integration/connectors/test_azure_ai_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
10
10
  test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
11
- test/integration/connectors/test_delta_table.py,sha256=4_KPyQJpd6DmyIjjtXWPMw6NNf7xULRkxmqfbvmZ80g,5018
12
- test/integration/connectors/test_kafka.py,sha256=3-OtZFZ93aCfmP0fUJzHJG7BBOfM5uCOtCLVHarsnMs,5869
11
+ test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
12
+ test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
13
+ test/integration/connectors/test_lancedb.py,sha256=O3YF6MVBkCsCgklXCJe8Kpy8aKGfafASVH4PspmpcYs,7628
14
+ test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
15
+ test/integration/connectors/test_mongodb.py,sha256=YeS_DUnVYN02F76j87W8RhXGHnJMzQYb3n-L1-oWGXI,12254
13
16
  test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
14
- test/integration/connectors/test_pinecone.py,sha256=809YADKRrdYnoXAd7HYaNCP3XJG7nb24NzOJkNu44nI,5535
17
+ test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
15
18
  test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
16
19
  test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
17
20
  test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
21
  test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
22
+ test/integration/connectors/elasticsearch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
+ test/integration/connectors/elasticsearch/conftest.py,sha256=-i4_7MkIxSQENz7nuD2uHuhGU9mZ33vpeTPhHtRpQfs,989
24
+ test/integration/connectors/elasticsearch/test_elasticsearch.py,sha256=nqdHwBpvgk_74orzDaQIKALK5cb0YloxSdt7QDJX0r0,11169
25
+ test/integration/connectors/elasticsearch/test_opensearch.py,sha256=Rk4tQ_Qv5icycDWMUpnzTbg-QzwGyb6nKqB0gDef9D0,10555
19
26
  test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- test/integration/connectors/sql/test_postgres.py,sha256=gDBuNyvWmpVPmDrSSYC99z3t17B_a196P1MwIAOp5Dk,6584
21
- test/integration/connectors/sql/test_singlestore.py,sha256=wGI3-lc6qh0qN4-WD9VtiXBB9MlekeqK402_9EXQyX0,5876
22
- test/integration/connectors/sql/test_snowflake.py,sha256=XXU2-2z_k8jHWP684v2IuaGOlV3cmPpg3RxkwMp08v8,6998
23
- test/integration/connectors/sql/test_sqlite.py,sha256=51QrFufAq-XxNjHAkmPWxdJUkGdIRRIGKeRT09A5pkA,5704
27
+ test/integration/connectors/sql/test_postgres.py,sha256=lrymDI7bVX_4qij5gsUc_bTvHPeelu6hpJemQ6WWmlY,6783
28
+ test/integration/connectors/sql/test_singlestore.py,sha256=iCp9q6tzhNIUCUubCPiRKj6VmJnwot4JGo9fkkTHg_U,5960
29
+ test/integration/connectors/sql/test_snowflake.py,sha256=DqQIV9H5Uv7HaHtDyrAPdqefd316oVt5lKtdJ2Zdk6Q,7082
30
+ test/integration/connectors/sql/test_sqlite.py,sha256=gSfp2hXAb5BGknzZXVa7K5bBwEb5Li4k5493mQCFjBQ,5719
24
31
  test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
32
  test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
26
- test/integration/connectors/utils/docker.py,sha256=JxfX8u46YwpqUnVGd4syI0SrqGqvGQx9yBN0Xq-bIKE,2328
33
+ test/integration/connectors/utils/docker.py,sha256=lnSjRgYoQa5c5nBdg2eLkB8KJVOjk4eyqq_C6PtTkME,4806
27
34
  test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQQx_cBfz13ZTs1HpbaYipNU,2223
28
- test/integration/connectors/utils/validation.py,sha256=5rQOBJyu1etvuwJmkH6xvKUPF08AKwJRxlN4L7-nw9w,13894
35
+ test/integration/connectors/utils/validation.py,sha256=SwvPVuHjJxTo8xEUwnuL9FZNpu3sZZ8iouOz5xh_kB8,14272
36
+ test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
38
+ test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
29
39
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
40
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
31
41
  test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -71,7 +81,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
71
81
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
82
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
73
83
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
74
- unstructured_ingest/__version__.py,sha256=lWtlg90A2bUoi9oMXDJVdgZ8UO2vchSsWKV19YBO4f0,42
84
+ unstructured_ingest/__version__.py,sha256=Js7MXQhyIj1akVjPNsLkmZxqoOHDGOr2opEPgFOSTZQ,42
75
85
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
76
86
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
77
87
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -267,7 +277,7 @@ unstructured_ingest/pipeline/utils.py,sha256=RNx4bv2FhKOhaK_YTiRubta7n9wmJwqzznF
267
277
  unstructured_ingest/pipeline/write.py,sha256=xmDjmbieGRrcI342he7PkgxWaMoSJ5nWPmP5AM2xloU,669
268
278
  unstructured_ingest/pipeline/reformat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
269
279
  unstructured_ingest/pipeline/reformat/chunking.py,sha256=vbJgi2Yl9Rd9yZxIf64Nxj6cjUJnJWRpDCagswQmrLw,6040
270
- unstructured_ingest/pipeline/reformat/embedding.py,sha256=ohNvW9MhVGKVCx8ZlnLlLgkFQ_6UYLA7yUwT7Bzj9I8,2522
280
+ unstructured_ingest/pipeline/reformat/embedding.py,sha256=vyRgrNvz50eMOCO00YdV9ODK0LRIB3_NF6t1mWD01uc,2525
271
281
  unstructured_ingest/runner/__init__.py,sha256=FO0X_jBIMilXdyjBajyFmzHoC3eVypNMGlhdOW4mcCM,2859
272
282
  unstructured_ingest/runner/airtable.py,sha256=1ndJ6PKT63E0gZN3KYFBj4Yo94zQYsIvSjC6ro2nIPE,1115
273
283
  unstructured_ingest/runner/astradb.py,sha256=FSBtQrsdC9E3eHUcAuQ0apcCnWolz-9tkvy-Uf7QeKg,1102
@@ -329,7 +339,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
329
339
  unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
340
  unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
331
341
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
332
- unstructured_ingest/utils/data_prep.py,sha256=9UKewDHB8-cMlQ8POvokhjVsy-ksiSqAAW2ibqPYAfk,4400
342
+ unstructured_ingest/utils/data_prep.py,sha256=IDAedOSBdgZpD9IY4tLJT-rmKGV7GHtU6KRj6VM-_tE,4666
333
343
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
334
344
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
335
345
  unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
@@ -382,36 +392,36 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
382
392
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
383
393
  unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
384
394
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
385
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ORSxrryPZErHAZTC3sp3UhWCh3G1B2SzTIM4H4OdVCc,5862
395
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
386
396
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
387
- unstructured_ingest/v2/processes/connectors/astradb.py,sha256=n5RT1l8pHbZG7m-CLKhWGCuWgfpeuIzvOZv7UAmTE6c,14683
388
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=5EXu565yDxjg7Iz0PO2mljwPnZVGYuWomNsbnMUOW_I,11813
397
+ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=zsIElFNNqVCXcLqBw6C8bRoyPQDrGNPkTWeA0FYYO94,14703
398
+ unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
389
399
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
390
400
  unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
391
401
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
392
- unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=ckdM6Z_hcltbtHdgkPi7_wntUvZSumAt7eQCxbmM4rQ,7480
393
- unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=lNvUbbTMv2ZKxRN6cesfD2AeQc1kQG9AKqY9RHBfVXs,16796
402
+ unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
394
403
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
395
- unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=a1BAvhX3nsgghjuR5CJ1lOwMtJ5ZJwimg6VtDYvluxA,13104
404
+ unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=EEwXK1Anlu-eXl5qxmdDIqPYW7eMSez6WGlTPG2vSn8,13121
396
405
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
397
406
  unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
398
- unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
399
- unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
407
+ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
408
+ unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
400
409
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
401
- unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
402
410
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
403
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=Fh7G0gam49HSxn6SoWIIgqYTBKkY34u6LzjZmJB7fMI,10762
411
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
404
412
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
405
413
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
406
414
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
407
415
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
408
- unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
409
416
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
410
417
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=p7sjCYZb7JmY3v3Xy1gm-q0O7oamLTsSFf2EWXYfXYQ,6447
411
418
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
412
419
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
413
420
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
414
421
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
422
+ unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
423
+ unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
424
+ unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
415
425
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
416
426
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
417
427
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
@@ -423,8 +433,14 @@ unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBW
423
433
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
424
434
  unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
425
435
  unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=qprsfI8VH0mVTa1MOCpa2D4coyopinQ5ag2KXcAecXE,3296
426
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=BkbozVTrDBingDuH8gTRiF5rceHoM1D3eibhl1pKgZQ,9092
436
+ unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=qEv_yaG94KekFtfS06KgpTTbqeJkje0hn5uOjsMMngw,9414
427
437
  unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=vwLZjvc_C17zOqcrzic0aIoPwS98sqYiwiMknw2IcK4,2586
438
+ unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=lHUPCOiyOGu1IME1QiyFBZaB8z8e3bP8Y8TkqKs32Qk,906
439
+ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=yR8V4O-oI_nUKJtHTLxhteEJpPDPn-_d2IkkXvgThJ0,1406
440
+ unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
441
+ unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
442
+ unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
443
+ unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
428
444
  unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
429
445
  unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
430
446
  unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
@@ -434,11 +450,16 @@ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=D43wrV2ADvQsT
434
450
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=rHaSb1MtdWMY6eQL2i2cWSL4w0VApFTChzmWtyfvFTI,5140
435
451
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1c5-2fnR3UAyj_4KfvjYTQ2cWzpvsdJOnU,5535
436
452
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
437
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=Jwu3ZC4PGEw9la72cOwC3tclYAoBXFQTII9Mhh8ziP4,11571
453
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
438
454
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
439
- unstructured_ingest-0.3.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
440
- unstructured_ingest-0.3.0.dist-info/METADATA,sha256=nn2t6UfzgYb6sr02uA_ixY-OQmcMwokknQ07Q9Kzdq0,7326
441
- unstructured_ingest-0.3.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
442
- unstructured_ingest-0.3.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
443
- unstructured_ingest-0.3.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
444
- unstructured_ingest-0.3.0.dist-info/RECORD,,
455
+ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=eXamSnQdzzMvt62z80B8nmlkwDKO-Pogln_K_zLz53A,1067
456
+ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
457
+ unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
458
+ unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
459
+ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=ln1p9ahFTaT-qsL7p4bgw_IqnU60As_l6vVAqUWyQVE,11655
460
+ unstructured_ingest-0.3.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
461
+ unstructured_ingest-0.3.2.dist-info/METADATA,sha256=rqTWqewB8eIrgrHJ-8AsNtehy35eSHKseCsveXTwN3Y,7326
462
+ unstructured_ingest-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
463
+ unstructured_ingest-0.3.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
464
+ unstructured_ingest-0.3.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
465
+ unstructured_ingest-0.3.2.dist-info/RECORD,,
@@ -1,242 +0,0 @@
1
- import json
2
- from dataclasses import dataclass, field
3
- from datetime import date, datetime
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Any, Optional
6
-
7
- from dateutil import parser
8
- from pydantic import Field, Secret
9
-
10
- from unstructured_ingest.error import DestinationConnectionError
11
- from unstructured_ingest.utils.dep_check import requires_dependencies
12
- from unstructured_ingest.v2.interfaces import (
13
- AccessConfig,
14
- ConnectionConfig,
15
- FileData,
16
- Uploader,
17
- UploaderConfig,
18
- UploadStager,
19
- UploadStagerConfig,
20
- )
21
- from unstructured_ingest.v2.logger import logger
22
- from unstructured_ingest.v2.processes.connector_registry import (
23
- DestinationRegistryEntry,
24
- )
25
-
26
- if TYPE_CHECKING:
27
- from weaviate import Client
28
-
29
# Identifier used as the default `connector_type` on the Weaviate connection
# config and uploader defined in this module.
CONNECTOR_TYPE = "weaviate"
30
-
31
-
32
class WeaviateAccessConfig(AccessConfig):
    # Sensitive credentials for a Weaviate instance. Every field is optional;
    # which one is actually used is decided by
    # WeaviateUploader._resolve_auth_method.

    # Bearer-token authentication.
    access_token: Optional[str] = Field(default=None, description="Used to create the bearer token.")
    # API-key authentication.
    api_key: Optional[str] = None
    # Client-credentials authentication.
    client_secret: Optional[str] = None
    # Password authentication; paired with the `username` held on the connection config.
    password: Optional[str] = None
39
-
40
-
41
class WeaviateConnectionConfig(ConnectionConfig):
    # Non-secret connection settings for a Weaviate destination, plus the
    # secret-wrapped access config carrying the credentials.

    host_url: str = Field(description="Weaviate instance url")
    class_name: str = Field(description="Name of the class to push the records into, e.g: Pdf-elements")
    # Wrapped in Secret so the credentials must be unwrapped explicitly with
    # get_secret_value() before use.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(),
        validate_default=True,
    )
    username: Optional[str] = None
    anonymous: bool = Field(
        default=False,
        description="if set, all auth values will be ignored",
    )
    scope: Optional[list[str]] = None
    refresh_token: Optional[str] = Field(
        default=None,
        description=(
            "Will tie this value to the bearer token. If not provided, "
            "the authentication will expire once the lifetime of the access token is up."
        ),
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
58
-
59
-
60
class WeaviateUploadStagerConfig(UploadStagerConfig):
    # The Weaviate stager takes no options of its own; this class only gives
    # it a distinct config type.
    pass
62
-
63
-
64
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrite partitioned element JSON so it conforms to the Weaviate schema.

    The stager loads a JSON list of element dicts, normalizes selected
    metadata fields in place (nested structures to JSON strings, dates to a
    fixed timestamp format, a few scalars to strings) and writes the result
    to ``<output_dir>/<output_filename>.json``.
    """

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as a numeric timestamp or a date string.

        Args:
            date_string: A value convertible to ``float`` (seconds since the
                epoch) or any string understood by ``dateutil.parser.parse``.

        Returns:
            The parsed ``datetime``.

        Raises:
            Whatever ``dateutil.parser.parse`` raises when the value is
            neither a timestamp nor a parseable date string.
        """
        try:
            return datetime.fromtimestamp(float(date_string))
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Not a usable numeric timestamp; fall through to dateutil.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @staticmethod
    def _nested_dict(container: dict, key: str) -> dict:
        """Return ``container[key]`` if it is a dict, otherwise an empty dict."""
        value = container.get(key)
        # A key that is present but null (or any non-dict) is treated as absent
        # so callers can chain lookups without an AttributeError.
        return value if isinstance(value, dict) else {}

    @classmethod
    def _format_date(cls, value: Any) -> str:
        """Parse ``value`` and render it in the timestamp format written to Weaviate."""
        # NOTE(review): numeric timestamps are converted with
        # datetime.fromtimestamp, i.e. local time, yet the format appends a
        # literal "Z". Confirm whether UTC was intended before changing it.
        return cls.parse_date_string(value).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    @classmethod
    def conform_dict(cls, data: dict) -> None:
        """
        Updates the element dictionary to conform to the Weaviate schema

        The dictionary is modified in place. Only fields that are present and
        truthy are converted; everything else is left untouched.
        """
        metadata = cls._nested_dict(data, "metadata")
        data_source = cls._nested_dict(metadata, "data_source")
        coordinates = cls._nested_dict(metadata, "coordinates")

        # Nested structures are stored as JSON strings, otherwise they fail
        # schema type checking.
        json_fields = (
            (data_source, ("record_locator", "permissions_data")),
            (coordinates, ("points",)),
            (metadata, ("links", "regex_metadata")),
        )
        for container, keys in json_fields:
            for key in keys:
                if value := container.get(key):
                    container[key] = json.dumps(value)

        # Datetime formatting
        date_fields = (
            (data_source, ("date_created", "date_modified", "date_processed")),
            (metadata, ("last_modified",)),
        )
        for container, keys in date_fields:
            for key in keys:
                if value := container.get(key):
                    container[key] = cls._format_date(value)

        # String casting
        string_fields = (
            (data_source, ("version",)),
            (metadata, ("page_number",)),
        )
        for container, keys in string_fields:
            for key in keys:
                if value := container.get(key):
                    container[key] = str(value)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the result.

        Args:
            elements_filepath: JSON file containing a list of element dicts.
            file_data: Unused by this stager.
            output_dir: Directory the staged file is written to.
            output_filename: Base name of the staged file; ``.json`` is appended.
            **kwargs: Ignored.

        Returns:
            Path of the staged JSON file.
        """
        # Explicit encoding so reading does not depend on the locale default.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)
        for element in elements_contents:
            self.conform_dict(data=element)
        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(elements_contents, output_file)
        return output_path
155
-
156
-
157
class WeaviateUploaderConfig(UploaderConfig):
    # Options for the Weaviate uploader; batch_size is passed to
    # client.batch.configure() in WeaviateUploader.run.
    batch_size: int = Field(
        default=100,
        description="Number of records per batch",
    )
159
-
160
-
161
@dataclass
class WeaviateUploader(Uploader):
    """Upload staged element JSON into a Weaviate class using the client's batch API."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> "Client":
        """Build a Weaviate client for the configured host and auth method."""
        from weaviate import Client

        auth = self._resolve_auth_method()
        return Client(url=self.connection_config.host_url, auth_client_secret=auth)

    def precheck(self) -> None:
        """Validate the connection settings by constructing a client.

        Raises:
            DestinationConnectionError: If the client cannot be created; the
                original exception is attached as the cause.
        """
        try:
            self.get_client()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            # Chain the original error so the root cause stays in the traceback.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["weaviate"], extras="weaviate")
    def _resolve_auth_method(self):
        """Pick the auth object to hand to the Weaviate client.

        The first matching option wins, in this order: anonymous (no auth),
        access token, API key, client secret, username + password.

        Returns:
            A ``weaviate.auth`` credentials object, or ``None`` when the
            connection is anonymous or no credentials are configured.
        """
        access_configs = self.connection_config.access_config.get_secret_value()
        connection_config = self.connection_config
        if connection_config.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=connection_config.refresh_token,
            )
        if access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)
        if access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret, scope=connection_config.scope
            )
        if connection_config.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=connection_config.username,
                password=access_configs.password,
                scope=connection_config.scope,
            )
        return None

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Write every element stored in ``path`` to the configured Weaviate class.

        Args:
            path: Staged JSON file containing a list of element dicts.
            file_data: Unused by this uploader.
            **kwargs: Ignored.
        """
        # Explicit encoding so reading does not depend on the locale default.
        with path.open("r", encoding="utf-8") as file:
            elements_dict = json.load(file)
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connection_config.class_name} "
            f"at {self.connection_config.host_url}",
        )

        client = self.get_client()
        client.batch.configure(batch_size=self.upload_config.batch_size)
        with client.batch as batch:
            for element in elements_dict:
                # The embedding is sent as the object's vector, not as a property.
                vector = element.pop("embeddings", None)
                batch.add_data_object(
                    element,
                    self.connection_config.class_name,
                    vector=vector,
                )
234
-
235
-
236
# Registry entry bundling the Weaviate connection config, stager and uploader
# (with their config classes) into a single destination definition.
weaviate_destination_entry = DestinationRegistryEntry(
    connection_config=WeaviateConnectionConfig,
    upload_stager_config=WeaviateUploadStagerConfig,
    upload_stager=WeaviateUploadStager,
    uploader_config=WeaviateUploaderConfig,
    uploader=WeaviateUploader,
)