unstructured-ingest 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (51) hide show
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +50 -3
  10. test/integration/connectors/test_delta_table.py +46 -0
  11. test/integration/connectors/test_kafka.py +40 -6
  12. test/integration/connectors/test_lancedb.py +209 -0
  13. test/integration/connectors/test_milvus.py +141 -0
  14. test/integration/connectors/test_pinecone.py +53 -1
  15. test/integration/connectors/utils/docker.py +81 -15
  16. test/integration/connectors/utils/validation.py +10 -0
  17. test/integration/connectors/weaviate/__init__.py +0 -0
  18. test/integration/connectors/weaviate/conftest.py +15 -0
  19. test/integration/connectors/weaviate/test_local.py +131 -0
  20. unstructured_ingest/__version__.py +1 -1
  21. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  22. unstructured_ingest/utils/data_prep.py +9 -1
  23. unstructured_ingest/v2/processes/connectors/__init__.py +3 -16
  24. unstructured_ingest/v2/processes/connectors/astradb.py +2 -2
  25. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +4 -0
  26. unstructured_ingest/v2/processes/connectors/delta_table.py +20 -4
  27. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  28. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +92 -46
  29. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  30. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +6 -0
  31. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  32. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  33. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  34. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  35. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  36. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  37. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  38. unstructured_ingest/v2/processes/connectors/pinecone.py +24 -7
  39. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  40. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  41. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  42. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  43. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  44. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  45. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +15 -15
  46. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +50 -30
  47. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  48. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  49. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  50. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  51. {unstructured_ingest-0.3.0.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
@@ -5,27 +5,36 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
5
5
  test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
6
6
  test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
8
- test/integration/connectors/test_astradb.py,sha256=Zy0xVOV60HOsvGKM4ankBE_X5ST37PBzR3iusk7DsEc,3492
8
+ test/integration/connectors/test_astradb.py,sha256=QPFrODXmOHagpuKaiooxXb3OEW93w2g4fmq8BkaBCnY,5303
9
9
  test/integration/connectors/test_azure_cog_search.py,sha256=dae4GifRiKue5YpsxworDiaMQoMsxcPDBithb6OFkx4,8876
10
10
  test/integration/connectors/test_confluence.py,sha256=xcPmZ_vi_pkCt-tUPn10P49FH9i_9YUbrAPO6fYk5rU,3521
11
- test/integration/connectors/test_delta_table.py,sha256=4_KPyQJpd6DmyIjjtXWPMw6NNf7xULRkxmqfbvmZ80g,5018
12
- test/integration/connectors/test_kafka.py,sha256=3-OtZFZ93aCfmP0fUJzHJG7BBOfM5uCOtCLVHarsnMs,5869
11
+ test/integration/connectors/test_delta_table.py,sha256=GSzWIkbEUzOrRPt2F1uO0dabcp7kTFDj75BhhI2y-WU,6856
12
+ test/integration/connectors/test_kafka.py,sha256=j7jsNWZumNBv9v-5Bpx8geUUXpxxad5EuA4CMRsl4R8,7104
13
+ test/integration/connectors/test_lancedb.py,sha256=1EqdXOaA3gJqXDe1W-dHUzfOfeL1A4RB0oYwKvlfltg,7590
14
+ test/integration/connectors/test_milvus.py,sha256=CVmYw9iEeKT_0OtShxye2E6i1LbWzzDA8JtwJRkYQlA,4763
13
15
  test/integration/connectors/test_onedrive.py,sha256=KIkBwKh1hnv203VCL2UABnDkS_bP4NxOFm1AL8EPGLA,3554
14
- test/integration/connectors/test_pinecone.py,sha256=809YADKRrdYnoXAd7HYaNCP3XJG7nb24NzOJkNu44nI,5535
16
+ test/integration/connectors/test_pinecone.py,sha256=X10OWZ6IrO6YyhuR3ydMAZOQq3u2f5u_lCjKNYUUcnI,7558
15
17
  test/integration/connectors/test_qdrant.py,sha256=ASvO-BNyhv8m8or28KljrJy27Da0uaTNeoR5w_QsvFg,5121
16
18
  test/integration/connectors/test_s3.py,sha256=YHEYMqWTKTfR7wlL4VoxtgMs1YiYKyhLIBdG-anaQGo,6896
17
19
  test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
20
  test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
21
+ test/integration/connectors/elasticsearch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ test/integration/connectors/elasticsearch/conftest.py,sha256=-i4_7MkIxSQENz7nuD2uHuhGU9mZ33vpeTPhHtRpQfs,989
23
+ test/integration/connectors/elasticsearch/test_elasticsearch.py,sha256=nqdHwBpvgk_74orzDaQIKALK5cb0YloxSdt7QDJX0r0,11169
24
+ test/integration/connectors/elasticsearch/test_opensearch.py,sha256=Rk4tQ_Qv5icycDWMUpnzTbg-QzwGyb6nKqB0gDef9D0,10555
19
25
  test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- test/integration/connectors/sql/test_postgres.py,sha256=gDBuNyvWmpVPmDrSSYC99z3t17B_a196P1MwIAOp5Dk,6584
21
- test/integration/connectors/sql/test_singlestore.py,sha256=wGI3-lc6qh0qN4-WD9VtiXBB9MlekeqK402_9EXQyX0,5876
22
- test/integration/connectors/sql/test_snowflake.py,sha256=XXU2-2z_k8jHWP684v2IuaGOlV3cmPpg3RxkwMp08v8,6998
23
- test/integration/connectors/sql/test_sqlite.py,sha256=51QrFufAq-XxNjHAkmPWxdJUkGdIRRIGKeRT09A5pkA,5704
26
+ test/integration/connectors/sql/test_postgres.py,sha256=lrymDI7bVX_4qij5gsUc_bTvHPeelu6hpJemQ6WWmlY,6783
27
+ test/integration/connectors/sql/test_singlestore.py,sha256=iCp9q6tzhNIUCUubCPiRKj6VmJnwot4JGo9fkkTHg_U,5960
28
+ test/integration/connectors/sql/test_snowflake.py,sha256=DqQIV9H5Uv7HaHtDyrAPdqefd316oVt5lKtdJ2Zdk6Q,7082
29
+ test/integration/connectors/sql/test_sqlite.py,sha256=gSfp2hXAb5BGknzZXVa7K5bBwEb5Li4k5493mQCFjBQ,5719
24
30
  test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
31
  test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
26
- test/integration/connectors/utils/docker.py,sha256=JxfX8u46YwpqUnVGd4syI0SrqGqvGQx9yBN0Xq-bIKE,2328
32
+ test/integration/connectors/utils/docker.py,sha256=lnSjRgYoQa5c5nBdg2eLkB8KJVOjk4eyqq_C6PtTkME,4806
27
33
  test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQQx_cBfz13ZTs1HpbaYipNU,2223
28
- test/integration/connectors/utils/validation.py,sha256=5rQOBJyu1etvuwJmkH6xvKUPF08AKwJRxlN4L7-nw9w,13894
34
+ test/integration/connectors/utils/validation.py,sha256=SwvPVuHjJxTo8xEUwnuL9FZNpu3sZZ8iouOz5xh_kB8,14272
35
+ test/integration/connectors/weaviate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
+ test/integration/connectors/weaviate/conftest.py,sha256=6Q6QdrLJmGHowRFSmoVSzup2EX6qASfS2Z5tqlpTm9M,387
37
+ test/integration/connectors/weaviate/test_local.py,sha256=SK6iEwQUKiCd0X99BEk8GlQoLaCcJcFPt09NN526Ct0,4508
29
38
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
39
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
31
40
  test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -71,7 +80,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
71
80
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
81
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
73
82
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
74
- unstructured_ingest/__version__.py,sha256=lWtlg90A2bUoi9oMXDJVdgZ8UO2vchSsWKV19YBO4f0,42
83
+ unstructured_ingest/__version__.py,sha256=0bjUtHIzwwONNua74ouSySVzVv9qumqBMBxOWLE7Tbo,42
75
84
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
76
85
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
77
86
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -267,7 +276,7 @@ unstructured_ingest/pipeline/utils.py,sha256=RNx4bv2FhKOhaK_YTiRubta7n9wmJwqzznF
267
276
  unstructured_ingest/pipeline/write.py,sha256=xmDjmbieGRrcI342he7PkgxWaMoSJ5nWPmP5AM2xloU,669
268
277
  unstructured_ingest/pipeline/reformat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
269
278
  unstructured_ingest/pipeline/reformat/chunking.py,sha256=vbJgi2Yl9Rd9yZxIf64Nxj6cjUJnJWRpDCagswQmrLw,6040
270
- unstructured_ingest/pipeline/reformat/embedding.py,sha256=ohNvW9MhVGKVCx8ZlnLlLgkFQ_6UYLA7yUwT7Bzj9I8,2522
279
+ unstructured_ingest/pipeline/reformat/embedding.py,sha256=vyRgrNvz50eMOCO00YdV9ODK0LRIB3_NF6t1mWD01uc,2525
271
280
  unstructured_ingest/runner/__init__.py,sha256=FO0X_jBIMilXdyjBajyFmzHoC3eVypNMGlhdOW4mcCM,2859
272
281
  unstructured_ingest/runner/airtable.py,sha256=1ndJ6PKT63E0gZN3KYFBj4Yo94zQYsIvSjC6ro2nIPE,1115
273
282
  unstructured_ingest/runner/astradb.py,sha256=FSBtQrsdC9E3eHUcAuQ0apcCnWolz-9tkvy-Uf7QeKg,1102
@@ -329,7 +338,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
329
338
  unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
330
339
  unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
331
340
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
332
- unstructured_ingest/utils/data_prep.py,sha256=9UKewDHB8-cMlQ8POvokhjVsy-ksiSqAAW2ibqPYAfk,4400
341
+ unstructured_ingest/utils/data_prep.py,sha256=IDAedOSBdgZpD9IY4tLJT-rmKGV7GHtU6KRj6VM-_tE,4666
333
342
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
334
343
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
335
344
  unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
@@ -382,36 +391,36 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
382
391
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
383
392
  unstructured_ingest/v2/processes/partitioner.py,sha256=agpHwB9FR8OZVQqE7zFEb0IcDPCOPA_BZjLzLF71nOY,8194
384
393
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
385
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=ORSxrryPZErHAZTC3sp3UhWCh3G1B2SzTIM4H4OdVCc,5862
394
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=8M3aYYNbOkS2SYG2B_HLHMgX4V69-Oz1VqpQcRQMiVg,5167
386
395
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=eeZJe-bBNxt5Sa-XEFCdcGeJCguJU5WN2Mv9kLp5dVQ,8917
387
- unstructured_ingest/v2/processes/connectors/astradb.py,sha256=n5RT1l8pHbZG7m-CLKhWGCuWgfpeuIzvOZv7UAmTE6c,14683
388
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=5EXu565yDxjg7Iz0PO2mljwPnZVGYuWomNsbnMUOW_I,11813
396
+ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=zsIElFNNqVCXcLqBw6C8bRoyPQDrGNPkTWeA0FYYO94,14703
397
+ unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=-6IijSWGqj-85vD0c4l5wdMHp-LF371jO8j53PPRB4I,12002
389
398
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
390
399
  unstructured_ingest/v2/processes/connectors/confluence.py,sha256=qQApDcmPBGg4tHXwSOj4JPkAbrO9GQ4NRlaETjhp25U,7003
391
400
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=yhMDbpkZXs-Kis7tFlgjvNemU-MdWMdpCZDrpZNFaU4,12180
392
- unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=ckdM6Z_hcltbtHdgkPi7_wntUvZSumAt7eQCxbmM4rQ,7480
393
- unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=lNvUbbTMv2ZKxRN6cesfD2AeQc1kQG9AKqY9RHBfVXs,16796
401
+ unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=1yS7ivEyiucwd_kv6LL5HQdGabT43yeG6XCdwiz89hc,8019
394
402
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=yBgCeLy9iCVI8bBDcHHuHB0H3BO05e9E1OccbHwvKAo,9724
395
403
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=a1BAvhX3nsgghjuR5CJ1lOwMtJ5ZJwimg6VtDYvluxA,13104
396
404
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
397
405
  unstructured_ingest/v2/processes/connectors/local.py,sha256=a3stgnIkhBbXPIQD0O-RaRM-Eb-szHj9Yy4Fz881-9c,6723
398
- unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYHQW3IIaGYY50b3URDSLEAFjtk,7687
406
+ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=Bzv2fa852BcM4_Pr-I_DPvLmjPoXv0Z7BeEA8qSKCDc,9725
399
407
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=A0pt6JcNTD5bEu79jZ8KhnHcBQ2VUJ2AjtQAtdFr_Lo,13175
400
408
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
401
- unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
402
409
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
403
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=Fh7G0gam49HSxn6SoWIIgqYTBKkY34u6LzjZmJB7fMI,10762
410
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=hWkXgVDAzCtrBxf7A4HoexBACGAfVf_Qvn9YHbeiBSY,11505
404
411
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
405
412
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
406
413
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
407
414
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
408
- unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
409
415
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
410
416
  unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=p7sjCYZb7JmY3v3Xy1gm-q0O7oamLTsSFf2EWXYfXYQ,6447
411
417
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
412
418
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
413
419
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
414
420
  unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=Wk7s2_u5G0BOV5slvGc8IlUf7ivznY9PrgPqe6nlJKM,2897
421
+ unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
422
+ unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=lzbrQ66zz3Dh_G29XFkyzQ84St8H_xfQVsYV4mTf32c,19141
423
+ unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
415
424
  unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Ypl_n6sl7I1JqX6bGSG0t_FqvCqE3Cy24og,1846
416
425
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
417
426
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
@@ -423,8 +432,14 @@ unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBW
423
432
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
424
433
  unstructured_ingest/v2/processes/connectors/kafka/__init__.py,sha256=mQJ9Ex-QCfhz-BB5YWTfbPf7xGLd1i7FpjRr0ukbhNw,754
425
434
  unstructured_ingest/v2/processes/connectors/kafka/cloud.py,sha256=qprsfI8VH0mVTa1MOCpa2D4coyopinQ5ag2KXcAecXE,3296
426
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=BkbozVTrDBingDuH8gTRiF5rceHoM1D3eibhl1pKgZQ,9092
435
+ unstructured_ingest/v2/processes/connectors/kafka/kafka.py,sha256=qEv_yaG94KekFtfS06KgpTTbqeJkje0hn5uOjsMMngw,9414
427
436
  unstructured_ingest/v2/processes/connectors/kafka/local.py,sha256=vwLZjvc_C17zOqcrzic0aIoPwS98sqYiwiMknw2IcK4,2586
437
+ unstructured_ingest/v2/processes/connectors/lancedb/__init__.py,sha256=lHUPCOiyOGu1IME1QiyFBZaB8z8e3bP8Y8TkqKs32Qk,906
438
+ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=yR8V4O-oI_nUKJtHTLxhteEJpPDPn-_d2IkkXvgThJ0,1406
439
+ unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
440
+ unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
441
+ unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=7WIShs2V3dpN6wUhDTt1j2rvdiPp6yopbh7XYkb9T3s,5129
442
+ unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
428
443
  unstructured_ingest/v2/processes/connectors/qdrant/__init__.py,sha256=xM19uYzAuGizVoZIM_hnVZ5AcBN69aOBGpqZcpWPtuE,760
429
444
  unstructured_ingest/v2/processes/connectors/qdrant/cloud.py,sha256=accJ4sNWBVWV-KiVBDBDBYYx5A9CUoikP5NCErRmfik,1624
430
445
  unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ4DU8yhJWMpL82QYwBVdPTxxNuV127U,1588
@@ -434,11 +449,16 @@ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=D43wrV2ADvQsT
434
449
  unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=rHaSb1MtdWMY6eQL2i2cWSL4w0VApFTChzmWtyfvFTI,5140
435
450
  unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1c5-2fnR3UAyj_4KfvjYTQ2cWzpvsdJOnU,5535
436
451
  unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=jl524VudwmFK63emCT7DmZan_EWJAMiGir5_zoO9FuY,5697
437
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=Jwu3ZC4PGEw9la72cOwC3tclYAoBXFQTII9Mhh8ziP4,11571
452
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=LFzGeAUagLknK07DsXg2oSG7ZAgR6VqT9wfI_tYlHUg,14782
438
453
  unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
439
- unstructured_ingest-0.3.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
440
- unstructured_ingest-0.3.0.dist-info/METADATA,sha256=nn2t6UfzgYb6sr02uA_ixY-OQmcMwokknQ07Q9Kzdq0,7326
441
- unstructured_ingest-0.3.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
442
- unstructured_ingest-0.3.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
443
- unstructured_ingest-0.3.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
444
- unstructured_ingest-0.3.0.dist-info/RECORD,,
454
+ unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
455
+ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=2g1Fm2J0ppfy2jCw4b5YtrsWrSD3VcrAaqiE7FlpIAg,6236
456
+ unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
457
+ unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
458
+ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
459
+ unstructured_ingest-0.3.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
460
+ unstructured_ingest-0.3.1.dist-info/METADATA,sha256=gEXBJbX1y03XJgGGqXpNlkOw1PJ4IhEHmohj2CXHq9g,7326
461
+ unstructured_ingest-0.3.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
462
+ unstructured_ingest-0.3.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
463
+ unstructured_ingest-0.3.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
464
+ unstructured_ingest-0.3.1.dist-info/RECORD,,
@@ -1,242 +0,0 @@
1
- import json
2
- from dataclasses import dataclass, field
3
- from datetime import date, datetime
4
- from pathlib import Path
5
- from typing import TYPE_CHECKING, Any, Optional
6
-
7
- from dateutil import parser
8
- from pydantic import Field, Secret
9
-
10
- from unstructured_ingest.error import DestinationConnectionError
11
- from unstructured_ingest.utils.dep_check import requires_dependencies
12
- from unstructured_ingest.v2.interfaces import (
13
- AccessConfig,
14
- ConnectionConfig,
15
- FileData,
16
- Uploader,
17
- UploaderConfig,
18
- UploadStager,
19
- UploadStagerConfig,
20
- )
21
- from unstructured_ingest.v2.logger import logger
22
- from unstructured_ingest.v2.processes.connector_registry import (
23
- DestinationRegistryEntry,
24
- )
25
-
26
- if TYPE_CHECKING:
27
- from weaviate import Client
28
-
29
# Identifier used as the default ``connector_type`` of the Weaviate connection
# config and uploader defined in this module.
CONNECTOR_TYPE = "weaviate"
30
-
31
-
32
class WeaviateAccessConfig(AccessConfig):
    # Sensitive Weaviate credentials. Every field is optional and defaults to None.

    # Token from which the bearer-token auth object is built.
    access_token: Optional[str] = Field(
        default=None,
        description="Used to create the bearer token.",
    )
    # API key credential.
    api_key: Optional[str] = None
    # Secret for the client-credentials flow.
    client_secret: Optional[str] = None
    # Password; paired with the connection config's ``username``.
    password: Optional[str] = None
39
-
40
-
41
class WeaviateConnectionConfig(ConnectionConfig):
    # Non-secret connection settings for a Weaviate instance, plus the wrapped
    # secret access config.

    host_url: str = Field(description="Weaviate instance url")
    class_name: str = Field(
        description="Name of the class to push the records into, e.g: Pdf-elements",
    )
    # Credentials are wrapped in ``Secret``; an empty access config is the default.
    access_config: Secret[WeaviateAccessConfig] = Field(
        default=WeaviateAccessConfig(),
        validate_default=True,
    )
    username: Optional[str] = None
    anonymous: bool = Field(
        default=False,
        description="if set, all auth values will be ignored",
    )
    scope: Optional[list[str]] = None
    refresh_token: Optional[str] = Field(
        default=None,
        description=(
            "Will tie this value to the bearer token. If not provided, "
            "the authentication will expire once the lifetime of the access token is up."
        ),
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
58
-
59
-
60
class WeaviateUploadStagerConfig(UploadStagerConfig):
    # The Weaviate stager adds no options of its own to the base stager config.
    ...
62
-
63
-
64
@dataclass
class WeaviateUploadStager(UploadStager):
    """Rewrite partitioned elements so that they conform to the Weaviate schema.

    ``run`` loads an elements JSON file, conforms every element in place via
    ``conform_dict`` and writes the result to ``<output_dir>/<output_filename>.json``.
    """

    upload_stager_config: WeaviateUploadStagerConfig = field(
        default_factory=lambda: WeaviateUploadStagerConfig()
    )

    # Layout used for every date field written by ``conform_dict``.
    # Deliberately un-annotated so the dataclass does not treat it as a field.
    _DATE_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Parse a date given either as epoch seconds or as free-form date text.

        Args:
            date_string: Either a value ``float()`` accepts, which is treated as a
                POSIX timestamp, or text that ``dateutil.parser.parse`` understands.

        Returns:
            The parsed ``datetime``.

        Raises:
            Whatever ``dateutil.parser.parse`` raises when the text is not a date.
        """
        try:
            # NOTE(review): ``fromtimestamp`` without ``tz`` returns naive local time,
            # yet ``conform_dict`` formats the result with a literal "Z" suffix.
            # Left unchanged to keep the stored values stable - confirm intent.
            return datetime.fromtimestamp(float(date_string))
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Not a usable timestamp; fall through to the free-form parser.
            logger.debug(f"date {date_string} string not a timestamp: {e}")
        return parser.parse(date_string)

    @staticmethod
    def _rewrite(container: Any, key: str, transform) -> None:
        """Replace ``container[key]`` with ``transform(value)`` when the value is truthy.

        Does nothing when ``container`` is not a dict, so a missing or null nested
        section is skipped rather than raising ``AttributeError``.
        """
        if not isinstance(container, dict):
            return
        if value := container.get(key):
            container[key] = transform(value)

    @classmethod
    def _format_date(cls, value: str) -> str:
        """Parse ``value`` with ``parse_date_string`` and render it in ``_DATE_FORMAT``."""
        return cls.parse_date_string(value).strftime(cls._DATE_FORMAT)

    @classmethod
    def conform_dict(cls, data: dict) -> None:
        """Update the element dictionary in place to conform to the Weaviate schema.

        Only keys that are present with a truthy value are rewritten:

        * nested structures (``record_locator``, ``points``, ``links``,
          ``permissions_data``, ``regex_metadata``) become JSON strings,
        * date fields are re-rendered as ``%Y-%m-%dT%H:%M:%S.%fZ``,
        * ``version`` and ``page_number`` are cast to ``str``.

        Args:
            data: A single element dictionary; mutated in place.
        """
        metadata = data.get("metadata")
        if not isinstance(metadata, dict):
            return
        data_source = metadata.get("data_source")
        coordinates = metadata.get("coordinates")

        # Dicts and arrays are stored as strings; ``json.dumps`` already returns ``str``.
        cls._rewrite(data_source, "record_locator", json.dumps)
        cls._rewrite(coordinates, "points", json.dumps)
        cls._rewrite(metadata, "links", json.dumps)
        cls._rewrite(data_source, "permissions_data", json.dumps)

        # Datetime formatting
        for date_key in ("date_created", "date_modified", "date_processed"):
            cls._rewrite(data_source, date_key, cls._format_date)
        cls._rewrite(metadata, "last_modified", cls._format_date)

        # String casting
        cls._rewrite(data_source, "version", str)
        cls._rewrite(metadata, "page_number", str)
        cls._rewrite(metadata, "regex_metadata", json.dumps)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Conform every element in ``elements_filepath`` and write the staged file.

        Args:
            elements_filepath: JSON file holding a list of element dictionaries.
            file_data: Not used by this stager.
            output_dir: Directory that receives the staged file.
            output_filename: Staged file name without the ``.json`` suffix.
            **kwargs: Ignored.

        Returns:
            Path of the written ``<output_dir>/<output_filename>.json`` file.
        """
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(elements_filepath, encoding="utf-8") as elements_file:
            elements_contents = json.load(elements_file)

        for element in elements_contents:
            self.conform_dict(data=element)

        output_path = Path(output_dir) / Path(f"{output_filename}.json")
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(elements_contents, output_file)
        return output_path
155
-
156
-
157
class WeaviateUploaderConfig(UploaderConfig):
    # Upload tuning options for the Weaviate uploader.
    batch_size: int = Field(
        default=100,
        description="Number of records per batch",
    )
159
-
160
-
161
@dataclass
class WeaviateUploader(Uploader):
    """Write staged element files into a Weaviate class using the client's batch API."""

    upload_config: WeaviateUploaderConfig
    connection_config: WeaviateConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @requires_dependencies(["weaviate"], extras="weaviate")
    def get_client(self) -> "Client":
        """Build a Weaviate ``Client`` for the configured host and credentials."""
        # Imported lazily so the module loads without the optional dependency.
        from weaviate import Client

        return Client(
            url=self.connection_config.host_url,
            auth_client_secret=self._resolve_auth_method(),
        )

    def precheck(self) -> None:
        """Verify that a client can be constructed.

        Raises:
            DestinationConnectionError: If building the client fails for any reason;
                the original exception is attached as the cause.
        """
        try:
            self.get_client()
        except Exception as e:
            logger.error(f"Failed to validate connection {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["weaviate"], extras="weaviate")
    def _resolve_auth_method(self):
        """Pick the auth object matching the first credential that is configured.

        Precedence: access token, then API key, then client secret, then
        username and password.

        Returns:
            A ``weaviate.auth`` credentials object, or ``None`` when ``anonymous`` is
            set or no credential is configured.
        """
        connection_config = self.connection_config
        access_configs = connection_config.access_config.get_secret_value()

        if connection_config.anonymous:
            return None

        if access_configs.access_token:
            from weaviate.auth import AuthBearerToken

            return AuthBearerToken(
                access_token=access_configs.access_token,
                refresh_token=connection_config.refresh_token,
            )

        if access_configs.api_key:
            from weaviate.auth import AuthApiKey

            return AuthApiKey(api_key=access_configs.api_key)

        if access_configs.client_secret:
            from weaviate.auth import AuthClientCredentials

            return AuthClientCredentials(
                client_secret=access_configs.client_secret,
                scope=connection_config.scope,
            )

        if connection_config.username and access_configs.password:
            from weaviate.auth import AuthClientPassword

            return AuthClientPassword(
                username=connection_config.username,
                password=access_configs.password,
                scope=connection_config.scope,
            )

        return None

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Upload every element stored in ``path`` to the configured class.

        Args:
            path: JSON file holding a list of element dictionaries.
            file_data: Not used by this uploader.
            **kwargs: Ignored.
        """
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with path.open("r", encoding="utf-8") as file:
            elements_dict = json.load(file)
        logger.info(
            f"writing {len(elements_dict)} objects to destination "
            f"class {self.connection_config.class_name} "
            f"at {self.connection_config.host_url}",
        )

        client = self.get_client()
        client.batch.configure(batch_size=self.upload_config.batch_size)
        with client.batch as batch:
            for element in elements_dict:
                # The embedding is passed as the object's vector, not as a property.
                vector = element.pop("embeddings", None)
                batch.add_data_object(
                    element,
                    self.connection_config.class_name,
                    vector=vector,
                )
234
-
235
-
236
# Registry entry bundling the Weaviate destination's connection config,
# stager and uploader classes together with their config classes.
weaviate_destination_entry = DestinationRegistryEntry(
    connection_config=WeaviateConnectionConfig,
    upload_stager_config=WeaviateUploadStagerConfig,
    upload_stager=WeaviateUploadStager,
    uploader_config=WeaviateUploaderConfig,
    uploader=WeaviateUploader,
)