unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -1,93 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.elasticsearch import (
7
- ElasticsearchCliDownloadConfig,
8
- ElasticsearchCliIndexerConfig,
9
- ElasticsearchCliUploadStagerConfig,
10
- ElasticsearchUploaderConfig,
11
- )
12
- from unstructured_ingest.v2.cli.interfaces import CliConfig
13
- from unstructured_ingest.v2.cli.utils import DelimitedString
14
- from unstructured_ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE
15
-
16
-
17
- @dataclass
18
- class OpenSearchCliConnectionConfig(CliConfig):
19
- @staticmethod
20
- def get_cli_options() -> list[click.Option]:
21
- options = [
22
- click.Option(
23
- ["--hosts"],
24
- type=DelimitedString(),
25
- help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
26
- ),
27
- click.Option(
28
- ["--username"], type=str, default=None, help="username when using basic auth"
29
- ),
30
- click.Option(
31
- ["--password"],
32
- type=str,
33
- default=None,
34
- help="password when using basic auth",
35
- ),
36
- click.Option(
37
- ["--use-ssl"],
38
- type=bool,
39
- default=False,
40
- is_flag=True,
41
- help="use ssl for the connection",
42
- ),
43
- click.Option(
44
- ["--verify-certs"],
45
- type=bool,
46
- default=False,
47
- is_flag=True,
48
- help="whether to verify SSL certificates",
49
- ),
50
- click.Option(
51
- ["--ssl-show-warn"],
52
- type=bool,
53
- default=False,
54
- is_flag=True,
55
- help="show warning when verify certs is disabled",
56
- ),
57
- click.Option(
58
- ["--ca-certs"],
59
- type=click.Path(),
60
- default=None,
61
- help="path to CA bundle",
62
- ),
63
- click.Option(
64
- ["--client-cert"],
65
- type=click.Path(),
66
- default=None,
67
- help="path to the file containing the private key and the certificate,"
68
- " or cert only if using client_key",
69
- ),
70
- click.Option(
71
- ["--client-key"],
72
- type=click.Path(),
73
- default=None,
74
- help="path to the file containing the private key"
75
- " if using separate cert and key files",
76
- ),
77
- ]
78
- return options
79
-
80
-
81
- opensearch_src_cmd = SrcCmd(
82
- cmd_name=CONNECTOR_TYPE,
83
- connection_config=OpenSearchCliConnectionConfig,
84
- indexer_config=ElasticsearchCliIndexerConfig,
85
- downloader_config=ElasticsearchCliDownloadConfig,
86
- )
87
-
88
- opensearch_dest_cmd = DestCmd(
89
- cmd_name=CONNECTOR_TYPE,
90
- connection_config=OpenSearchCliConnectionConfig,
91
- upload_stager_config=ElasticsearchCliUploadStagerConfig,
92
- uploader_config=ElasticsearchUploaderConfig,
93
- )
@@ -1,62 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.pinecone import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class PineconeCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--api-key"],
17
- required=True,
18
- type=str,
19
- help="API key for Pinecone.",
20
- ),
21
- click.Option(
22
- ["--index-name"],
23
- required=True,
24
- type=str,
25
- help="Name of the index to connect to. Example: my-index",
26
- ),
27
- click.Option(
28
- ["--environment"],
29
- required=True,
30
- type=str,
31
- help="Environment to connect to. Example: us-east-1",
32
- ),
33
- ]
34
- return options
35
-
36
-
37
- @dataclass
38
- class PineconeCliUploaderConfig(CliConfig):
39
- @staticmethod
40
- def get_cli_options() -> list[click.Option]:
41
- options = [
42
- click.Option(
43
- ["--batch-size"],
44
- default=100,
45
- type=int,
46
- help="Number of records per batch",
47
- ),
48
- click.Option(
49
- ["--num-processes"],
50
- default=4,
51
- type=int,
52
- help="Number of processes to use for uploading",
53
- ),
54
- ]
55
- return options
56
-
57
-
58
- pinecone_dest_cmd = DestCmd(
59
- cmd_name=CONNECTOR_TYPE,
60
- connection_config=PineconeCliConnectionConfig,
61
- uploader_config=PineconeCliUploaderConfig,
62
- )
@@ -1,79 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
- from unstructured_ingest.v2.processes.connectors.salesforce import (
9
- ACCEPTED_CATEGORIES,
10
- CONNECTOR_TYPE,
11
- )
12
-
13
-
14
- @dataclass
15
- class SalesforceCliConnectionConfig(CliConfig):
16
- @staticmethod
17
- def get_cli_options() -> list[click.Option]:
18
- options = [
19
- click.Option(
20
- ["--username"],
21
- required=True,
22
- type=str,
23
- help="Salesforce username usually looks like an email.",
24
- ),
25
- click.Option(
26
- ["--consumer-key"],
27
- required=True,
28
- type=str,
29
- help="For the Salesforce JWT auth. Found in Consumer Details.",
30
- ),
31
- click.Option(
32
- ["--private-key"],
33
- required=True,
34
- type=str,
35
- help="Path to the private key or its contents for the Salesforce JWT auth. "
36
- "Key file is usually named server.key.",
37
- ),
38
- ]
39
- return options
40
-
41
-
42
- @dataclass
43
- class SalesforceCliIndexerConfig(CliConfig):
44
- @staticmethod
45
- def get_cli_options() -> list[click.Option]:
46
- possible_categories = ACCEPTED_CATEGORIES
47
- options = [
48
- click.Option(
49
- ["--categories"],
50
- default=None,
51
- required=True,
52
- type=DelimitedString(choices=possible_categories),
53
- help="Comma-delimited salesforce categories to download. "
54
- "Currently only {}.".format(", ".join(possible_categories)),
55
- ),
56
- ]
57
- return options
58
-
59
-
60
- @dataclass
61
- class SalesforceCliDownloadConfig(CliConfig):
62
- @staticmethod
63
- def get_cli_options() -> list[click.Option]:
64
- options = [
65
- click.Option(
66
- ["--download-dir"],
67
- help="Where files are downloaded to, defaults to a location at"
68
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
69
- ),
70
- ]
71
- return options
72
-
73
-
74
- salesforce_src_cmd = SrcCmd(
75
- cmd_name=CONNECTOR_TYPE,
76
- connection_config=SalesforceCliConnectionConfig,
77
- indexer_config=SalesforceCliIndexerConfig,
78
- downloader_config=SalesforceCliDownloadConfig,
79
- )
@@ -1,112 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.sharepoint import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class SharepointCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--client-id"],
17
- default=None,
18
- type=str,
19
- help="Sharepoint app client ID",
20
- ),
21
- click.Option(
22
- ["--client-cred"],
23
- default=None,
24
- type=str,
25
- help="Sharepoint app secret",
26
- ),
27
- click.Option(
28
- ["--site"],
29
- default=None,
30
- type=str,
31
- help="Sharepoint site url. Process either base url e.g \
32
- https://[tenant].sharepoint.com or relative sites \
33
- https://[tenant].sharepoint.com/sites/<site_name>. \
34
- To process all sites within the tenant pass a site url as \
35
- https://[tenant]-admin.sharepoint.com.\
36
- This requires the app to be registered at a tenant level",
37
- ),
38
- click.Option(
39
- ["--permissions-application-id"],
40
- type=str,
41
- help="Microsoft Graph API application id",
42
- ),
43
- click.Option(
44
- ["--permissions-client-cred"],
45
- type=str,
46
- help="Microsoft Graph API application credentials",
47
- ),
48
- click.Option(
49
- ["--permissions-tenant"],
50
- type=str,
51
- help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
52
- ),
53
- ]
54
- return options
55
-
56
-
57
- @dataclass
58
- class SharepointCliIndexerConfig(CliConfig):
59
- @staticmethod
60
- def get_cli_options() -> list[click.Option]:
61
- options = [
62
- click.Option(
63
- ["--path"],
64
- default=None,
65
- type=str,
66
- help="Path from which to start parsing files. If the connector is to \
67
- process all sites within the tenant this filter will be applied to \
68
- all sites document libraries.",
69
- ),
70
- click.Option(
71
- ["--recursive"],
72
- is_flag=True,
73
- default=False,
74
- help="Recursively download files in their respective folders "
75
- "otherwise stop at the files in provided folder level.",
76
- ),
77
- click.Option(
78
- ["--omit-files"],
79
- is_flag=True,
80
- default=False,
81
- help="Don't process files.",
82
- ),
83
- click.Option(
84
- ["--omit-pages"],
85
- is_flag=True,
86
- default=False,
87
- help="Don't process site pages.",
88
- ),
89
- ]
90
- return options
91
-
92
-
93
- @dataclass
94
- class SharepointCliDownloadConfig(CliConfig):
95
- @staticmethod
96
- def get_cli_options() -> list[click.Option]:
97
- options = [
98
- click.Option(
99
- ["--download-dir"],
100
- help="Where files are downloaded to, defaults to a location at"
101
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
102
- ),
103
- ]
104
- return options
105
-
106
-
107
- sharepoint_drive_src_cmd = SrcCmd(
108
- cmd_name=CONNECTOR_TYPE,
109
- connection_config=SharepointCliConnectionConfig,
110
- indexer_config=SharepointCliIndexerConfig,
111
- downloader_config=SharepointCliDownloadConfig,
112
- )
@@ -1,96 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.singlestore import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class SingleStoreCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--host"],
17
- required=False,
18
- type=str,
19
- default=None,
20
- help="SingleStore host",
21
- ),
22
- click.Option(
23
- ["--port"],
24
- required=False,
25
- type=int,
26
- default=None,
27
- help="SingleStore port",
28
- ),
29
- click.Option(
30
- ["--user"],
31
- required=False,
32
- type=str,
33
- default=None,
34
- help="SingleStore user",
35
- ),
36
- click.Option(
37
- ["--password"],
38
- required=False,
39
- type=str,
40
- default=None,
41
- help="SingleStore password",
42
- ),
43
- click.Option(
44
- ["--database"],
45
- required=False,
46
- type=str,
47
- default=None,
48
- help="SingleStore database",
49
- ),
50
- ]
51
- return options
52
-
53
-
54
- @dataclass
55
- class SingleStoreCliUploaderConfig(CliConfig):
56
- @staticmethod
57
- def get_cli_options() -> list[click.Option]:
58
- options = [
59
- click.Option(
60
- ["--drop-empty-cols"],
61
- required=False,
62
- type=bool,
63
- is_flag=True,
64
- default=False,
65
- help="Drop any columns that have no data",
66
- ),
67
- ]
68
- return options
69
-
70
-
71
- @dataclass
72
- class SingleStoreCliUploadStagerConfig(CliConfig):
73
- @staticmethod
74
- def get_cli_options() -> list[click.Option]:
75
- return [
76
- click.Option(
77
- ["--table-name"],
78
- required=False,
79
- type=str,
80
- help="SingleStore table to write contents to",
81
- ),
82
- click.Option(
83
- ["--batch-size"],
84
- required=False,
85
- type=click.IntRange(min=1),
86
- help="Batch size when writing to SingleStore",
87
- ),
88
- ]
89
-
90
-
91
- singlestore_dest_cmd = DestCmd(
92
- cmd_name=CONNECTOR_TYPE,
93
- connection_config=SingleStoreCliConnectionConfig,
94
- uploader_config=SingleStoreCliUploaderConfig,
95
- upload_stager_config=SingleStoreCliUploadStagerConfig,
96
- )
@@ -1,84 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.sql import CONNECTOR_TYPE
8
-
9
- SQL_DRIVERS = {"postgresql", "sqlite"}
10
-
11
-
12
- @dataclass
13
- class SQLCliConnectionConfig(CliConfig):
14
- @staticmethod
15
- def get_cli_options() -> list[click.Option]:
16
- options = [
17
- click.Option(
18
- ["--db-type"],
19
- required=True,
20
- type=click.Choice(SQL_DRIVERS),
21
- help="Type of the database backend",
22
- ),
23
- click.Option(
24
- ["--username"],
25
- default=None,
26
- type=str,
27
- help="DB username",
28
- ),
29
- click.Option(
30
- ["--password"],
31
- default=None,
32
- type=str,
33
- help="DB password",
34
- ),
35
- click.Option(
36
- ["--host"],
37
- default=None,
38
- type=str,
39
- help="DB host",
40
- ),
41
- click.Option(
42
- ["--port"],
43
- default=None,
44
- type=int,
45
- help="DB host connection port",
46
- ),
47
- click.Option(
48
- ["--database"],
49
- default=None,
50
- type=str,
51
- help="Database name. For sqlite databases, this is the path to the .db file.",
52
- ),
53
- ]
54
- return options
55
-
56
-
57
- @dataclass
58
- class SQLCliUploaderConfig(CliConfig):
59
- @staticmethod
60
- def get_cli_options() -> list[click.Option]:
61
- options = [
62
- click.Option(
63
- ["--batch-size"],
64
- default=100,
65
- type=int,
66
- help="Number of records per batch",
67
- )
68
- ]
69
- return options
70
-
71
-
72
- @dataclass
73
- class SQLCliUploadStagerConfig(CliConfig):
74
- @staticmethod
75
- def get_cli_options() -> list[click.Option]:
76
- return []
77
-
78
-
79
- sql_dest_cmd = DestCmd(
80
- cmd_name=CONNECTOR_TYPE,
81
- connection_config=SQLCliConnectionConfig,
82
- uploader_config=SQLCliUploaderConfig,
83
- upload_stager_config=SQLCliUploadStagerConfig,
84
- )
@@ -1,100 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
- from unstructured_ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE
9
-
10
-
11
- @dataclass
12
- class WeaviateCliConnectionConfig(CliConfig):
13
- @staticmethod
14
- def get_cli_options() -> list[click.Option]:
15
- options = [
16
- click.Option(
17
- ["--host-url"],
18
- required=True,
19
- help="Weaviate instance url",
20
- ),
21
- click.Option(
22
- ["--class-name"],
23
- default=None,
24
- type=str,
25
- help="Name of the class to push the records into, e.g: Pdf-elements",
26
- ),
27
- click.Option(
28
- ["--access-token"], default=None, type=str, help="Used to create the bearer token."
29
- ),
30
- click.Option(
31
- ["--refresh-token"],
32
- default=None,
33
- type=str,
34
- help="Will tie this value to the bearer token. If not provided, "
35
- "the authentication will expire once the lifetime of the access token is up.",
36
- ),
37
- click.Option(
38
- ["--api-key"],
39
- default=None,
40
- type=str,
41
- ),
42
- click.Option(
43
- ["--client-secret"],
44
- default=None,
45
- type=str,
46
- ),
47
- click.Option(
48
- ["--scope"],
49
- default=None,
50
- type=DelimitedString(),
51
- ),
52
- click.Option(
53
- ["--username"],
54
- default=None,
55
- type=str,
56
- ),
57
- click.Option(
58
- ["--password"],
59
- default=None,
60
- type=str,
61
- ),
62
- click.Option(
63
- ["--anonymous"],
64
- is_flag=True,
65
- default=False,
66
- type=bool,
67
- help="if set, all auth values will be ignored",
68
- ),
69
- ]
70
- return options
71
-
72
-
73
- @dataclass
74
- class WeaviateCliUploaderConfig(CliConfig):
75
- @staticmethod
76
- def get_cli_options() -> list[click.Option]:
77
- options = [
78
- click.Option(
79
- ["--batch-size"],
80
- default=100,
81
- type=int,
82
- help="Number of records per batch",
83
- )
84
- ]
85
- return options
86
-
87
-
88
- @dataclass
89
- class WeaviateCliUploadStagerConfig(CliConfig):
90
- @staticmethod
91
- def get_cli_options() -> list[click.Option]:
92
- return []
93
-
94
-
95
- weaviate_dest_cmd = DestCmd(
96
- cmd_name=CONNECTOR_TYPE,
97
- connection_config=WeaviateCliConnectionConfig,
98
- uploader_config=WeaviateCliUploaderConfig,
99
- upload_stager_config=WeaviateCliUploadStagerConfig,
100
- )
@@ -1,6 +0,0 @@
1
- from .chunk import ChunkerCliConfig
2
- from .embed import EmbedderCliConfig
3
- from .partition import PartitionerCliConfig
4
- from .processor import ProcessorCliConfig
5
-
6
- __all__ = ["ChunkerCliConfig", "ProcessorCliConfig", "PartitionerCliConfig", "EmbedderCliConfig"]