unstructured-ingest 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (39)
  1. test/integration/connectors/conftest.py +13 -0
  2. test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
  3. test/integration/connectors/sql/test_postgres.py +6 -10
  4. test/integration/connectors/sql/test_singlestore.py +156 -0
  5. test/integration/connectors/sql/test_snowflake.py +205 -0
  6. test/integration/connectors/sql/test_sqlite.py +6 -10
  7. test/integration/connectors/test_delta_table.py +138 -0
  8. test/integration/connectors/test_s3.py +1 -1
  9. test/integration/connectors/utils/docker.py +78 -0
  10. test/integration/connectors/utils/docker_compose.py +23 -8
  11. test/integration/connectors/utils/validation.py +93 -2
  12. unstructured_ingest/__version__.py +1 -1
  13. unstructured_ingest/v2/cli/utils/click.py +32 -1
  14. unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
  15. unstructured_ingest/v2/interfaces/file_data.py +1 -0
  16. unstructured_ingest/v2/interfaces/indexer.py +4 -1
  17. unstructured_ingest/v2/pipeline/pipeline.py +10 -2
  18. unstructured_ingest/v2/pipeline/steps/index.py +18 -1
  19. unstructured_ingest/v2/processes/connectors/__init__.py +13 -6
  20. unstructured_ingest/v2/processes/connectors/astradb.py +278 -55
  21. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +3 -1
  22. unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +1 -0
  24. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  25. unstructured_ingest/v2/processes/connectors/sql/__init__.py +15 -2
  26. unstructured_ingest/v2/processes/connectors/sql/postgres.py +33 -56
  27. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +168 -0
  28. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +162 -0
  29. unstructured_ingest/v2/processes/connectors/sql/sql.py +51 -12
  30. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +31 -32
  31. unstructured_ingest/v2/unstructured_api.py +1 -1
  32. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/METADATA +19 -17
  33. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/RECORD +37 -31
  34. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
  35. unstructured_ingest/v2/processes/connectors/singlestore.py +0 -156
  36. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.1.1.dist-info → unstructured_ingest-0.2.1.dist-info}/top_level.txt +0 -0
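
Two of the changes above are moves rather than pure additions: the flat databricks_volumes.py and singlestore.py modules (items 34 and 35) are deleted, and their successors live under the databricks and sql subpackages (items 21 and 27). A minimal import-migration sketch follows; the module paths come from the file list, but the class names are assumed to have survived the move unchanged, since the new module bodies are not shown in this diff:

```python
# Hypothetical migration from 0.1.1 to 0.2.1. Paths are taken from the file
# list above; the class names are assumed, not confirmed by this diff.

# 0.1.1 (modules deleted in 0.2.1):
# from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
#     DatabricksVolumesConnectionConfig,
# )
# from unstructured_ingest.v2.processes.connectors.singlestore import (
#     SingleStoreConnectionConfig,
# )

# 0.2.1 (new locations):
from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
    DatabricksVolumesConnectionConfig,
)
from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
    SingleStoreConnectionConfig,
)
```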
--- unstructured_ingest-0.1.1.dist-info/RECORD
+++ unstructured_ingest-0.2.1.dist-info/RECORD
@@ -4,17 +4,21 @@ test/integration/utils.py,sha256=CWqzEGw6TA_ZoP9hRUkW64TWYssooBbufcTRmbJvod8,401
  test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/chunkers/test_chunkers.py,sha256=pqn1Rqh36jZTJL4qpU0iuOMFAEQ-LrKAPOgWtQMAt_I,1482
  test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/integration/connectors/conftest.py,sha256=Q8ScDzrzO2o-8D_kYFt8LL7QAhoFTRRtKJKMc2hLMcI,345
- test/integration/connectors/test_s3.py,sha256=fK0soCTkNxp-4hm4O2LPrhlZXvYmaeTmeEgeNh1b0k8,5839
+ test/integration/connectors/conftest.py,sha256=6dVNMBrL6WIO4KXA-0nf2tNrPYk_tsor8uomi6fbi3Q,727
+ test/integration/connectors/test_delta_table.py,sha256=4_KPyQJpd6DmyIjjtXWPMw6NNf7xULRkxmqfbvmZ80g,5018
+ test/integration/connectors/test_s3.py,sha256=1ErPRpNmbg-88ig80SfIyxujF7xnAWtI42WSue4sgKU,5850
  test/integration/connectors/databricks_tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=kS45mnNu9_U4qV3cxByEFXCYLEBWRy-fxxhzR3r93cs,5685
+ test/integration/connectors/databricks_tests/test_volumes_native.py,sha256=k4lALbwNtlyuI3wd3OHoBULI21E3Ck2Fo8EJXaVfwgw,5812
  test/integration/connectors/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- test/integration/connectors/sql/test_postgres.py,sha256=A9vWj5pBdoEyL2m6d3e2Ep8ZZcnLhdXkaHPPlkTStbg,6581
- test/integration/connectors/sql/test_sqlite.py,sha256=F6Ljb6npmFZlq_5pvJj-0Hkk2mC3T-pMAGyhDm1UtM4,5702
+ test/integration/connectors/sql/test_postgres.py,sha256=gDBuNyvWmpVPmDrSSYC99z3t17B_a196P1MwIAOp5Dk,6584
+ test/integration/connectors/sql/test_singlestore.py,sha256=wGI3-lc6qh0qN4-WD9VtiXBB9MlekeqK402_9EXQyX0,5876
+ test/integration/connectors/sql/test_snowflake.py,sha256=XXU2-2z_k8jHWP684v2IuaGOlV3cmPpg3RxkwMp08v8,6998
+ test/integration/connectors/sql/test_sqlite.py,sha256=51QrFufAq-XxNjHAkmPWxdJUkGdIRRIGKeRT09A5pkA,5704
  test/integration/connectors/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/connectors/utils/constants.py,sha256=0zSPnsZVqJuNhXduXvdXFQLZTRIQa5Fo_1qjBYVCfb8,209
- test/integration/connectors/utils/docker_compose.py,sha256=6XeYOKQFZCBRLEmcgH2mmBAaVs6R6jCWAhJLjq6p-aM,1771
- test/integration/connectors/utils/validation.py,sha256=Sf0ELATWG5K3E3d5S_ArtZeFFYdzoI5jN86U4DiqNyw,8422
+ test/integration/connectors/utils/docker.py,sha256=-wknXRVlzr3BVPdEhCyJgsdNjO9aSb2xjb-mQ306j7Q,2256
+ test/integration/connectors/utils/docker_compose.py,sha256=GVTB6Cel05c0VQ2n4AwkQQx_cBfz13ZTs1HpbaYipNU,2223
+ test/integration/connectors/utils/validation.py,sha256=gnflehoYbFkSBJdXQV-7HwcrlL_Cuqni2ri1YmArjT0,12019
  test/integration/embedders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  test/integration/embedders/conftest.py,sha256=B2W771RbijR7G_GybsCzRyIvOzXqzbKZdRIlNDd5AGY,334
  test/integration/embedders/test_bedrock.py,sha256=0oBRNS_DtFDGQ22Z1T3t6VOJ31PrItgvnJpqcLe9Fg4,1903
@@ -43,7 +47,7 @@ test/unit/embed/test_openai.py,sha256=0O1yshDcE0BMKv1yJqrNuiNLSdPhLpKqJ-D_wmnids
  test/unit/embed/test_vertexai.py,sha256=Pl7COc9E3tf_yGidkTEmTizNGyZF1F5zuL2TgPTMnfI,1048
  test/unit/embed/test_voyageai.py,sha256=DviCOJFhe5H4e26-kNyX3JNe8h3qB5Yl0KOe8rQEMrc,981
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
- unstructured_ingest/__version__.py,sha256=ch9Ch304-rlC6iFyomBT7OHb9bvtQNzaejmd5QwbzKE,42
+ unstructured_ingest/__version__.py,sha256=Hmm5OuicK0ynl_R5DSnpRYWJpEXwe7guJdsAMHH7K60,42
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
  unstructured_ingest/interfaces.py,sha256=m03BgenxSA34HbW157L7V9TGxK_dTG7N2AnAhF31W-U,31364
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -310,7 +314,7 @@ unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LG
  unstructured_ingest/v2/logger.py,sha256=wcln4s5Nyp2fjjJux9iM3d6t9aQFNJ2H1IAZXmIknjI,4323
  unstructured_ingest/v2/main.py,sha256=WFdLEqEXRy6E9_G-dF20MK2AtgX51Aan1sp_N67U2B8,172
  unstructured_ingest/v2/otel.py,sha256=2fGj1c7cVcC3J8NwL6MNYhyPEAXiB33DsilvRDkrdLo,4130
- unstructured_ingest/v2/unstructured_api.py,sha256=1EQVuL-TojmFxWoWFzXr1oCFPEC3IkCQqhjXM8spdTY,3373
+ unstructured_ingest/v2/unstructured_api.py,sha256=HqOaQ80YTdAnFj_2Ce108g7Pp3-F9Qg329Uw2OXtRmA,3375
  unstructured_ingest/v2/utils.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
  unstructured_ingest/v2/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfiMULE,648
@@ -321,13 +325,13 @@ unstructured_ingest/v2/cli/base/dest.py,sha256=zDjqek7anr0JQ2ptEl8KIAsUXuCuHRnBQ
  unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
  unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
  unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- unstructured_ingest/v2/cli/utils/click.py,sha256=HCEcdHf8Lck0zcx3kidKjLbHDHXIBxPRL2MGgtKtDlg,6967
- unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
+ unstructured_ingest/v2/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
+ unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=7eEIkk1KU51-ZNiIfI1KRxlwITNW1xl1YxMAG8BcTk0,7604
  unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
  unstructured_ingest/v2/interfaces/connector.py,sha256=qUFFJ3qgDMenTCZMtVRjq1DIwsVak6pxNjQOH2eVkMw,1623
  unstructured_ingest/v2/interfaces/downloader.py,sha256=Lj3nTY1hPA71GfNeedFVCdHdZsHLle8qrx5RtXAy9GY,2940
- unstructured_ingest/v2/interfaces/file_data.py,sha256=ieJK-hqHCEOmoYNGoFbCHziSaZyMtRS9VpSoYbwoKCE,1944
- unstructured_ingest/v2/interfaces/indexer.py,sha256=Bd1S-gTLsxhJBLEh1lYm_gXqwQLaEZMoqPq9yGxtN_E,713
+ unstructured_ingest/v2/interfaces/file_data.py,sha256=D71bXImJ7Pyjtl3I3pa2O2B2iBqIaY-mC-hdoEF3RmI,1983
+ unstructured_ingest/v2/interfaces/indexer.py,sha256=gsa1MLhFa82BzD2h4Yb7ons0VxRwKINZOrzvHAahwVU,846
  unstructured_ingest/v2/interfaces/process.py,sha256=BgglTu5K93FnDDopZKKr_rkK2LTZOguR6kcQjKHjF40,392
  unstructured_ingest/v2/interfaces/processor.py,sha256=VX7JqXlbG1plxMK8THWhWINPbTICaaUEk4XUXhnOixY,3303
  unstructured_ingest/v2/interfaces/upload_stager.py,sha256=ZFkDxcwKn-6EPrTbdBEgOkz1kGAq4gUtze98KP48KG4,1146
@@ -335,13 +339,13 @@ unstructured_ingest/v2/interfaces/uploader.py,sha256=JmZDl1blJa5rS61YHCae3Hfet84
  unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/pipeline/interfaces.py,sha256=-Y6gPnl-SbNxIx5-dQCmiYSPKUMjivrRlBLIKIUWVeM,8658
  unstructured_ingest/v2/pipeline/otel.py,sha256=K3pQvWVgWzyOWMKCBUofsH7wTZPJ0Ysw5sLjMBLW41I,1088
- unstructured_ingest/v2/pipeline/pipeline.py,sha256=x6hanD7Cj7Wd5MBUvb33UwXQMZxubzwlAiYyBCMukuc,15693
+ unstructured_ingest/v2/pipeline/pipeline.py,sha256=7Yg8_xwlSX6lA-oPGlTcn6KXZ9kc51zsoJxME5TiUlw,15956
  unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  unstructured_ingest/v2/pipeline/steps/chunk.py,sha256=rYVcHSXeQSzWszg6VmtYlNc66Gsx-22Ti0BxPyQaJak,3135
  unstructured_ingest/v2/pipeline/steps/download.py,sha256=lzvOl5SoUK6OCCVVeG4CzdPIGj6eKKCGdciNo_0RMNk,8173
  unstructured_ingest/v2/pipeline/steps/embed.py,sha256=-YFvmchdsonWiSXxaD7PJfuUUtMLklaQM_8kZCQxCdM,3113
  unstructured_ingest/v2/pipeline/steps/filter.py,sha256=q7bNieaFMprqoF8Mx7w-ZN6jyA5peiGeTGyPtvcV-uw,1199
- unstructured_ingest/v2/pipeline/steps/index.py,sha256=nfDo-wt5sooKtMHKG7sI42m1L44uw-pxErDlDB1engw,2678
+ unstructured_ingest/v2/pipeline/steps/index.py,sha256=YUUf1sYZRZSrRgapca3Sfzk1sNPJ05yyTQ5wKlyDjEo,3543
  unstructured_ingest/v2/pipeline/steps/partition.py,sha256=9MQViptxK3ALKco8uE4gK9PpEoGq5JjzyU14C_18blU,3193
  unstructured_ingest/v2/pipeline/steps/stage.py,sha256=cphKgHScLz2rNLZRI5Olsb6dAH-MKGu3p6MYS1BEzkA,2246
  unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=CFSy4tGp6BAvF0oIwWFN8v4zFzh5pRDeESjEn5iP9hE,1756
@@ -353,13 +357,13 @@ unstructured_ingest/v2/processes/embedder.py,sha256=PQn0IO8xbGRQHpcT2VVl-J8gTJ5H
  unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
  unstructured_ingest/v2/processes/partitioner.py,sha256=2Lhztd730soVC2TOqrn_ba7CGZna8AHHpqJY2ZUYVxE,7776
  unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
- unstructured_ingest/v2/processes/connectors/__init__.py,sha256=glyowqb93_NNreQXoRLbF0PvzMc6Ptv0ARfl3xfSH4E,4967
+ unstructured_ingest/v2/processes/connectors/__init__.py,sha256=zMO50wOGWOJrCTdh19Najj-i5tfMUyf977TKz4yN04A,5249
  unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
- unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
+ unstructured_ingest/v2/processes/connectors/astradb.py,sha256=k6zaxm05-ESpRV6w1jgrtfE10-I2Z50kafURxxJVzdk,14043
  unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
+ unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=ZZfdNTw1W0ISQGWCtM1JuIME26FYzuPBOqRKql0wlLg,7013
  unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
  unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=8bGHbZctJ_Tl1AUSMnI7CCZ7CgEtTRVcRuvlB1HPlqQ,5907
@@ -372,11 +376,11 @@ unstructured_ingest/v2/processes/connectors/outlook.py,sha256=NK67Pd8Nk5oUIXTK-s
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6gkhRrNpXIFECYZ_2Gjz_XRtY6Y,7561
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
- unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
+ unstructured_ingest/v2/processes/connectors/slack.py,sha256=b9IanzUApUexiJzuNg7PR3tujOoeG8dhM0L0v4MDuPw,9256
  unstructured_ingest/v2/processes/connectors/utils.py,sha256=8kd0g7lo9NqnpaIkjeO-Ut6erhwUNH_gS9koevpe3WE,878
  unstructured_ingest/v2/processes/connectors/weaviate.py,sha256=Ss0YyD5T6k-00eJ6dr5lSo2H0LcOjVTMmozehyTvnAo,8866
  unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=jO71UTC7bLA_N12CrLWJzh_yZML5gfT7VohxzCpUGWg,1848
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=db4PxE1LiKWUq0b9THABFRChArAfHps89pZBglqEg3c,6521
+ unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=8FasrRcoqa9zrhmnbfYN_rBBTH6xBXM50TzGsUMEm98,6581
  unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=I1MJwe5LOxoPLjwo00H0XbXO6u_SJHWYgsj4s6ePoyI,2754
  unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=P4rfcE3td7WyuuguRgUnGQytCMDpfeYrrpshBZuVynY,3539
  unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=UUotY_-HpgSEJkvdQfZTlbxY7CRLZ4ctL8TlryeFvxk,2790
@@ -385,18 +389,20 @@ unstructured_ingest/v2/processes/connectors/fsspec/__init__.py,sha256=TtdeImM7Yp
  unstructured_ingest/v2/processes/connectors/fsspec/azure.py,sha256=Y01BuVRql0Kvzc_cdaZE9dDGYjJzrwJu-etfUrEGcUU,7061
  unstructured_ingest/v2/processes/connectors/fsspec/box.py,sha256=Cjk0LUxqOCDbme0GmnD_5_b1hfStjI23cKw6BquKNrg,5488
  unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py,sha256=NNAxIRdOQxUncfwhu7J7SnQRM6BSStNOyQZi-4E51iY,5816
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=eFcrpSAB8wbLHuCiDb-2QpEUtgEEUA_iSqcT81H2-3Q,11472
+ unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py,sha256=5uZ_nGBXNQgwvfjNcor6mwzbYOHeja4-EV3nNCXvxaQ,11512
  unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyNIaf_xyFbPiiR7pnWEEg_8mp0rIZ8,7053
  unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
- unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=tr3SZH0tz04XSxqGRkUu__tL_0zn0bSms2jILE-3Rug,543
- unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=oMwfYCycX-jTSKW-c6o6K09aU74Wn1B_G3Ib20oYi1A,6050
- unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=MbSvYSjhgGj8HHI7P-gH5bQ0Lqxtf8BEFsKNmCUfzug,9807
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=LxC2Q_rPHytbTDflmWzj4H5Jx-41phKnfp6FCpDe-UY,5701
- unstructured_ingest-0.1.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
- unstructured_ingest-0.1.1.dist-info/METADATA,sha256=LQ_M1kX7q7rGBvslwml9KbrJGJHAaA_SLWM64BBaZrg,7188
- unstructured_ingest-0.1.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- unstructured_ingest-0.1.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
- unstructured_ingest-0.1.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
- unstructured_ingest-0.1.1.dist-info/RECORD,,
+ unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=D43wrV2ADvQsToIYwbEWnZ7mhzlsYcZMFCqf6jIC7dQ,1333
+ unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=__Wf5lkCQGhbtEH_2DxfNmQyWP-UKC9o_KEawG81jY0,4905
+ unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=YrmhAL1RQ1c5-2fnR3UAyj_4KfvjYTQ2cWzpvsdJOnU,5535
+ unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=0s0oBfMttPg5JL6jn8SsoCeTSRoXXdVy2bJAZv_hiSk,5576
+ unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=rWDkefUnYkzJT0mhIcHxieECdaIWLTvbDcOcZgLA4FQ,11636
+ unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=9605K36nQ5-gBxzt1daYKYotON1SE85RETusqCJrbdk,5230
+ unstructured_ingest-0.2.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+ unstructured_ingest-0.2.1.dist-info/METADATA,sha256=NBV3OAonxt8Y0Tra7LWqQBoLSROwA106sf8vDCsXu2k,7271
+ unstructured_ingest-0.2.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ unstructured_ingest-0.2.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+ unstructured_ingest-0.2.1.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+ unstructured_ingest-0.2.1.dist-info/RECORD,,
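
For readers comparing the hash columns above: each RECORD row has the standard wheel format `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64 encoding of the file's raw SHA-256 digest with trailing `=` padding stripped. A small sketch of how to recompute a row from an installed wheel:

```python
# How the sha256=... values in a wheel RECORD are derived: urlsafe base64 of
# the raw SHA-256 digest, with trailing '=' padding stripped.
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# e.g. record_entry("unstructured_ingest/__version__.py") should reproduce the
# "...,sha256=Hmm5OuicK0ynl_R5DSnpRYWJpEXwe7guJdsAMHH7K60,42" row above.
```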
--- unstructured_ingest/v2/processes/connectors/databricks_volumes.py
+++ /dev/null
@@ -1,250 +0,0 @@
- import os
- from dataclasses import dataclass
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generator, Optional
-
- from pydantic import Field, Secret
-
- from unstructured_ingest.error import (
-     DestinationConnectionError,
-     SourceConnectionError,
-     SourceConnectionNetworkError,
- )
- from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import (
-     AccessConfig,
-     ConnectionConfig,
-     Downloader,
-     DownloaderConfig,
-     DownloadResponse,
-     FileData,
-     FileDataSourceMetadata,
-     Indexer,
-     IndexerConfig,
-     SourceIdentifiers,
-     Uploader,
-     UploaderConfig,
- )
- from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     DestinationRegistryEntry,
-     SourceRegistryEntry,
- )
-
- if TYPE_CHECKING:
-     from databricks.sdk import WorkspaceClient
-
- CONNECTOR_TYPE = "databricks_volumes"
-
-
- class DatabricksVolumesAccessConfig(AccessConfig):
-     account_id: Optional[str] = Field(
-         default=None,
-         description="The Databricks account ID for the Databricks "
-         "accounts endpoint. Only has effect when Host is "
-         "either https://accounts.cloud.databricks.com/ (AWS), "
-         "https://accounts.azuredatabricks.net/ (Azure), "
-         "or https://accounts.gcp.databricks.com/ (GCP).",
-     )
-     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
-     client_secret: Optional[str] = Field(
-         default=None, description="Client Secret of the OAuth app."
-     )
-     token: Optional[str] = Field(
-         default=None,
-         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
-         "Azure Active Directory (Azure AD) token (Azure).",
-     )
-     profile: Optional[str] = None
-     azure_workspace_resource_id: Optional[str] = Field(
-         default=None,
-         description="The Azure Resource Manager ID for the Azure Databricks workspace, "
-         "which is exchanged for a Databricks host URL.",
-     )
-     azure_client_secret: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s client secret."
-     )
-     azure_client_id: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s application ID."
-     )
-     azure_tenant_id: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s tenant ID."
-     )
-     azure_environment: Optional[str] = Field(
-         default=None,
-         description="The Azure environment type for a " "specific set of API endpoints",
-         examples=["Public", "UsGov", "China", "Germany"],
-     )
-     auth_type: Optional[str] = Field(
-         default=None,
-         description="When multiple auth attributes are available in the "
-         "environment, use the auth type specified by this "
-         "argument. This argument also holds the currently "
-         "selected auth.",
-     )
-     google_credentials: Optional[str] = None
-     google_service_account: Optional[str] = None
-
-
- class DatabricksVolumesConnectionConfig(ConnectionConfig):
-     access_config: Secret[DatabricksVolumesAccessConfig] = Field(
-         default=DatabricksVolumesAccessConfig(), validate_default=True
-     )
-     host: Optional[str] = Field(
-         default=None,
-         description="The Databricks host URL for either the "
-         "Databricks workspace endpoint or the "
-         "Databricks accounts endpoint.",
-     )
-     volume: str = Field(description="Name of volume in the Unity Catalog")
-     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
-     volume_path: Optional[str] = Field(
-         default=None, description="Optional path within the volume to write to"
-     )
-     databricks_schema: str = Field(
-         default="default",
-         alias="schema",
-         description="Schema associated with the volume to write to in the Unity Catalog service",
-     )
-
-     @property
-     def path(self) -> str:
-         path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
-         if self.volume_path:
-             path = f"{path}/{self.volume_path}"
-         return path
-
-     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-     def get_client(self) -> "WorkspaceClient":
-         from databricks.sdk import WorkspaceClient
-
-         return WorkspaceClient(
-             host=self.host,
-             **self.access_config.get_secret_value().model_dump(),
-         )
-
-
- @dataclass
- class DatabricksVolumesIndexerConfig(IndexerConfig):
-     recursive: bool = False
-
-
- @dataclass
- class DatabricksVolumesIndexer(Indexer):
-     index_config: DatabricksVolumesIndexerConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             self.connection_config.get_client()
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
-
-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-         for file_info in self.connection_config.get_client().dbfs.list(
-             path=self.connection_config.path, recursive=self.index_config.recursive
-         ):
-             if file_info.is_dir:
-                 continue
-             rel_path = file_info.path.replace(self.connection_config.path, "")
-             if rel_path.startswith("/"):
-                 rel_path = rel_path[1:]
-             filename = Path(file_info.path).name
-             yield FileData(
-                 identifier=file_info.path,
-                 connector_type=CONNECTOR_TYPE,
-                 source_identifiers=SourceIdentifiers(
-                     filename=filename,
-                     rel_path=rel_path,
-                     fullpath=file_info.path,
-                 ),
-                 additional_metadata={
-                     "catalog": self.connection_config.catalog,
-                 },
-                 metadata=FileDataSourceMetadata(
-                     url=file_info.path, date_modified=str(file_info.modification_time)
-                 ),
-             )
-
-
- @dataclass
- class DatabricksVolumesDownloaderConfig(DownloaderConfig):
-     pass
-
-
- @dataclass
- class DatabricksVolumesDownloader(Downloader):
-     download_config: DatabricksVolumesDownloaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             self.connection_config.get_client()
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
-
-     def get_download_path(self, file_data: FileData) -> Path:
-         return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
-
-     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         download_path = self.get_download_path(file_data=file_data)
-         download_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.info(f"Writing {file_data.identifier} to {download_path}")
-         try:
-             with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
-                 read_content = c._read_handle.read()
-             with open(download_path, "wb") as f:
-                 f.write(read_content)
-         except Exception as e:
-             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
-             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-         return self.generate_download_response(file_data=file_data, download_path=download_path)
-
-
- class DatabricksVolumesUploaderConfig(UploaderConfig):
-     overwrite: bool = Field(
-         default=False, description="If true, an existing file will be overwritten."
-     )
-
-
- @dataclass
- class DatabricksVolumesUploader(Uploader):
-     upload_config: DatabricksVolumesUploaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             assert self.connection_config.get_client().current_user.me().active
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise DestinationConnectionError(f"failed to validate connection: {e}")
-
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         output_path = os.path.join(self.connection_config.path, path.name)
-         with open(path, "rb") as elements_file:
-             self.connection_config.get_client().files.upload(
-                 file_path=output_path,
-                 contents=elements_file,
-                 overwrite=self.upload_config.overwrite,
-             )
-
-
- databricks_volumes_destination_entry = DestinationRegistryEntry(
-     connection_config=DatabricksVolumesConnectionConfig,
-     uploader=DatabricksVolumesUploader,
-     uploader_config=DatabricksVolumesUploaderConfig,
- )
-
- databricks_volumes_source_entry = SourceRegistryEntry(
-     connection_config=DatabricksVolumesConnectionConfig,
-     indexer=DatabricksVolumesIndexer,
-     indexer_config=DatabricksVolumesIndexerConfig,
-     downloader=DatabricksVolumesDownloader,
-     downloader_config=DatabricksVolumesDownloaderConfig,
- )
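
One detail of the removed module is worth keeping in mind when reading its replacement, databricks/volumes.py, and the per-cloud volumes_aws.py / volumes_azure.py / volumes_gcp.py siblings listed in the RECORD diff: Unity Catalog volume paths are built as /Volumes/<catalog>/<schema>/<volume>[/<subpath>]. A standalone restatement of the path property from the deleted code, for illustration only (this is not the 0.2.1 implementation):

```python
# Restates DatabricksVolumesConnectionConfig.path from the removed module
# above as a plain function.
from typing import Optional

def volume_path(catalog: str, schema: str, volume: str,
                subpath: Optional[str] = None) -> str:
    # Unity Catalog volumes are addressed as /Volumes/<catalog>/<schema>/<volume>
    path = f"/Volumes/{catalog}/{schema}/{volume}"
    if subpath:
        path = f"{path}/{subpath}"
    return path

assert volume_path("main", "default", "ingest") == "/Volumes/main/default/ingest"
assert volume_path("main", "default", "ingest", "out") == "/Volumes/main/default/ingest/out"
```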
--- unstructured_ingest/v2/processes/connectors/singlestore.py
+++ /dev/null
@@ -1,156 +0,0 @@
- import json
- from dataclasses import dataclass
- from datetime import date, datetime
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, Optional
-
- import numpy as np
- import pandas as pd
- from dateutil import parser
- from pydantic import Field, Secret
-
- from unstructured_ingest.utils.data_prep import batch_generator
- from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.utils.table import convert_to_pandas_dataframe
- from unstructured_ingest.v2.interfaces import (
-     AccessConfig,
-     ConnectionConfig,
-     FileData,
-     Uploader,
-     UploaderConfig,
-     UploadStager,
-     UploadStagerConfig,
- )
- from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     DestinationRegistryEntry,
- )
-
- if TYPE_CHECKING:
-     from singlestoredb.connection import Connection
-
- CONNECTOR_TYPE = "singlestore"
-
-
- class SingleStoreAccessConfig(AccessConfig):
-     password: Optional[str] = Field(default=None, description="SingleStore password")
-
-
- class SingleStoreConnectionConfig(ConnectionConfig):
-     host: Optional[str] = Field(default=None, description="SingleStore host")
-     port: Optional[int] = Field(default=None, description="SingleStore port")
-     user: Optional[str] = Field(default=None, description="SingleStore user")
-     database: Optional[str] = Field(default=None, description="SingleStore database")
-     access_config: Secret[SingleStoreAccessConfig]
-
-     @requires_dependencies(["singlestoredb"], extras="singlestore")
-     def get_connection(self) -> "Connection":
-         import singlestoredb as s2
-
-         conn = s2.connect(
-             host=self.host,
-             port=self.port,
-             database=self.database,
-             user=self.user,
-             password=self.access_config.get_secret_value().password,
-         )
-         return conn
-
-
- class SingleStoreUploadStagerConfig(UploadStagerConfig):
-     drop_empty_cols: bool = Field(default=False, description="Drop any columns that have no data")
-
-
- @dataclass
- class SingleStoreUploadStager(UploadStager):
-     upload_stager_config: SingleStoreUploadStagerConfig
-
-     @staticmethod
-     def parse_date_string(date_string: str) -> date:
-         try:
-             timestamp = float(date_string)
-             return datetime.fromtimestamp(timestamp)
-         except Exception as e:
-             logger.debug(f"date {date_string} string not a timestamp: {e}")
-         return parser.parse(date_string)
-
-     def run(
-         self,
-         elements_filepath: Path,
-         file_data: FileData,
-         output_dir: Path,
-         output_filename: str,
-         **kwargs: Any,
-     ) -> Path:
-         with open(elements_filepath) as elements_file:
-             elements_contents = json.load(elements_file)
-         output_path = Path(output_dir) / Path(f"{output_filename}.csv")
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-
-         df = convert_to_pandas_dataframe(
-             elements_dict=elements_contents,
-             drop_empty_cols=self.upload_stager_config.drop_empty_cols,
-         )
-         datetime_columns = [
-             "data_source_date_created",
-             "data_source_date_modified",
-             "data_source_date_processed",
-         ]
-         for column in filter(lambda x: x in df.columns, datetime_columns):
-             df[column] = df[column].apply(self.parse_date_string)
-         if "data_source_record_locator" in df.columns:
-             df["data_source_record_locator"] = df["data_source_record_locator"].apply(
-                 lambda x: json.dumps(x) if x else None
-             )
-
-         with output_path.open("w") as output_file:
-             df.to_csv(output_file, index=False)
-         return output_path
-
-
- class SingleStoreUploaderConfig(UploaderConfig):
-     table_name: str = Field(description="SingleStore table to write contents to")
-     batch_size: int = Field(default=100, description="Batch size when writing to SingleStore")
-
-
- @dataclass
- class SingleStoreUploader(Uploader):
-     connection_config: SingleStoreConnectionConfig
-     upload_config: SingleStoreUploaderConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def upload_csv(self, csv_path: Path) -> None:
-         df = pd.read_csv(csv_path)
-         logger.debug(
-             f"uploading {len(df)} entries to {self.connection_config.database} "
-             f"db in table {self.upload_config.table_name}"
-         )
-         stmt = "INSERT INTO {} ({}) VALUES ({})".format(
-             self.upload_config.table_name,
-             ", ".join(df.columns),
-             ", ".join(["%s"] * len(df.columns)),
-         )
-         logger.debug(f"sql statement: {stmt}")
-         df.replace({np.nan: None}, inplace=True)
-         data_as_tuples = list(df.itertuples(index=False, name=None))
-         with self.connection_config.get_connection() as conn:
-             with conn.cursor() as cur:
-                 for chunk in batch_generator(
-                     data_as_tuples, batch_size=self.upload_config.batch_size
-                 ):
-                     cur.executemany(stmt, chunk)
-                     conn.commit()
-
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         if path.suffix != ".csv":
-             raise ValueError(f"Only .csv files are supported: {path}")
-         self.upload_csv(csv_path=path)
-
-
- singlestore_destination_entry = DestinationRegistryEntry(
-     connection_config=SingleStoreConnectionConfig,
-     uploader=SingleStoreUploader,
-     uploader_config=SingleStoreUploaderConfig,
-     upload_stager=SingleStoreUploadStager,
-     upload_stager_config=SingleStoreUploadStagerConfig,
- )
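
The stager's parse_date_string helper illustrates the date normalization this connector applies before writing CSV: the value is first treated as epoch seconds, with free-form parsing as the fallback. A standalone restatement follows; whether the new sql/singlestore.py keeps this helper is not shown in this diff:

```python
# Restates SingleStoreUploadStager.parse_date_string from the removed module
# above: epoch-seconds first, dateutil free-form parsing as the fallback.
from datetime import datetime
from dateutil import parser

def parse_date_string(date_string: str) -> datetime:
    try:
        # Epoch seconds, e.g. "1718000000" or "1718000000.5".
        return datetime.fromtimestamp(float(date_string))
    except ValueError:
        # Anything non-numeric, e.g. "2024-06-10T12:00:00".
        return parser.parse(date_string)

print(parse_date_string("1718000000"))           # parsed as a timestamp
print(parse_date_string("2024-06-10T12:00:00"))  # parsed as a date string
```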