unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
@@ -1,108 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import Dict
8
- from unstructured_ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE
9
-
10
-
11
- @dataclass
12
- class ChromaCliConnectionConfig(CliConfig):
13
- @staticmethod
14
- def get_cli_options() -> list[click.Option]:
15
- options = [
16
- click.Option(
17
- ["--path"],
18
- required=False,
19
- type=str,
20
- help="Location where Chroma is persisted," "if not connecting via http.",
21
- ),
22
- click.Option(
23
- ["--settings"],
24
- required=False,
25
- type=Dict(),
26
- help="A dictionary of settings to communicate with the chroma server."
27
- 'example: \'{"persist_directory":"./chroma-persist"}\' ',
28
- ),
29
- click.Option(
30
- ["--tenant"],
31
- required=False,
32
- default="default_tenant",
33
- type=str,
34
- help="The tenant to use for this client. Chroma defaults to 'default_tenant'.",
35
- ),
36
- click.Option(
37
- ["--database"],
38
- required=False,
39
- default="default_database",
40
- type=str,
41
- help="The database to use for this client."
42
- "Chroma defaults to 'default_database'.",
43
- ),
44
- click.Option(
45
- ["--host"],
46
- required=False,
47
- type=str,
48
- help="The hostname of the Chroma server.",
49
- ),
50
- click.Option(
51
- ["--port"],
52
- required=False,
53
- type=int,
54
- help="The port of the Chroma server.",
55
- ),
56
- click.Option(
57
- ["--ssl"],
58
- required=False,
59
- default=False,
60
- is_flag=True,
61
- type=bool,
62
- help="Whether to use SSL to connect to the Chroma server.",
63
- ),
64
- click.Option(
65
- ["--headers"],
66
- required=False,
67
- type=Dict(),
68
- help="A dictionary of headers to send to the Chroma server."
69
- 'example: \'{"Authorization":"Basic()"}\' ',
70
- ),
71
- click.Option(
72
- ["--collection-name"],
73
- required=True,
74
- type=str,
75
- help="The name of the Chroma collection to write into.",
76
- ),
77
- ]
78
- return options
79
-
80
-
81
- @dataclass
82
- class ChromaCliUploaderConfig(CliConfig):
83
- @staticmethod
84
- def get_cli_options() -> list[click.Option]:
85
- options = [
86
- click.Option(
87
- ["--batch-size"],
88
- default=100,
89
- type=int,
90
- help="Number of records per batch",
91
- )
92
- ]
93
- return options
94
-
95
-
96
- @dataclass
97
- class ChromaCliUploadStagerConfig(CliConfig):
98
- @staticmethod
99
- def get_cli_options() -> list[click.Option]:
100
- return []
101
-
102
-
103
- chroma_dest_cmd = DestCmd(
104
- cmd_name=CONNECTOR_TYPE,
105
- connection_config=ChromaCliConnectionConfig,
106
- uploader_config=ChromaCliUploaderConfig,
107
- upload_stager_config=ChromaCliUploadStagerConfig,
108
- )
@@ -1,161 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.databricks_volumes import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class DatabricksVolumesCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--host"],
17
- type=str,
18
- default=None,
19
- help="The Databricks host URL for either the "
20
- "Databricks workspace endpoint or the "
21
- "Databricks accounts endpoint.",
22
- ),
23
- click.Option(
24
- ["--account-id"],
25
- type=str,
26
- default=None,
27
- help="The Databricks account ID for the Databricks "
28
- "accounts endpoint. Only has effect when Host is "
29
- "either https://accounts.cloud.databricks.com/ (AWS), "
30
- "https://accounts.azuredatabricks.net/ (Azure), "
31
- "or https://accounts.gcp.databricks.com/ (GCP).",
32
- ),
33
- click.Option(
34
- ["--username"],
35
- type=str,
36
- default=None,
37
- help="The Databricks username part of basic authentication. "
38
- "Only possible when Host is *.cloud.databricks.com (AWS).",
39
- ),
40
- click.Option(
41
- ["--password"],
42
- type=str,
43
- default=None,
44
- help="The Databricks password part of basic authentication. "
45
- "Only possible when Host is *.cloud.databricks.com (AWS).",
46
- ),
47
- click.Option(["--client-id"], type=str, default=None),
48
- click.Option(["--client-secret"], type=str, default=None),
49
- click.Option(
50
- ["--token"],
51
- type=str,
52
- default=None,
53
- help="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
54
- "Azure Active Directory (Azure AD) token (Azure).",
55
- ),
56
- click.Option(
57
- ["--azure-workspace-resource-id"],
58
- type=str,
59
- default=None,
60
- help="The Azure Resource Manager ID for the Azure Databricks workspace, "
61
- "which is exchanged for a Databricks host URL.",
62
- ),
63
- click.Option(
64
- ["--azure-client-secret"],
65
- type=str,
66
- default=None,
67
- help="The Azure AD service principal’s client secret.",
68
- ),
69
- click.Option(
70
- ["--azure-client-id"],
71
- type=str,
72
- default=None,
73
- help="The Azure AD service principal’s application ID.",
74
- ),
75
- click.Option(
76
- ["--azure-tenant-id"],
77
- type=str,
78
- default=None,
79
- help="The Azure AD service principal’s tenant ID.",
80
- ),
81
- click.Option(
82
- ["--azure-environment"],
83
- type=str,
84
- default=None,
85
- help="The Azure environment type (such as Public, UsGov, China, and Germany) for a "
86
- "specific set of API endpoints. Defaults to PUBLIC.",
87
- ),
88
- click.Option(
89
- ["--auth-type"],
90
- type=str,
91
- default=None,
92
- help="When multiple auth attributes are available in the "
93
- "environment, use the auth type specified by this "
94
- "argument. This argument also holds the currently "
95
- "selected auth.",
96
- ),
97
- click.Option(["--cluster-id"], type=str, default=None),
98
- click.Option(["--google-credentials"], type=str, default=None),
99
- click.Option(["--google-service-account"], type=str, default=None),
100
- ]
101
- return options
102
-
103
-
104
- @dataclass
105
- class DatabricksVolumesCliUploaderConfig(CliConfig):
106
- @staticmethod
107
- def get_cli_options() -> list[click.Option]:
108
- options = [
109
- click.Option(
110
- ["--volume"], type=str, required=True, help="Name of volume in the Unity Catalog"
111
- ),
112
- click.Option(
113
- ["--catalog"],
114
- type=str,
115
- required=True,
116
- help="Name of the catalog in the Databricks Unity Catalog service",
117
- ),
118
- click.Option(
119
- ["--volume-path"],
120
- type=str,
121
- required=False,
122
- default=None,
123
- help="Optional path within the volume to write to",
124
- ),
125
- click.Option(
126
- ["--overwrite"],
127
- type=bool,
128
- is_flag=True,
129
- help="If true, an existing file will be overwritten.",
130
- ),
131
- click.Option(
132
- ["--encoding"],
133
- type=str,
134
- required=True,
135
- default="utf-8",
136
- help="Encoding applied to the data when written to the volume",
137
- ),
138
- click.Option(
139
- ["--schema"],
140
- type=str,
141
- required=True,
142
- default="default",
143
- help="Schema associated with the volume to write to in the Unity Catalog service",
144
- ),
145
- ]
146
- return options
147
-
148
-
149
- @dataclass
150
- class DatabricksVolumesCliUploadStagerConfig(CliConfig):
151
- @staticmethod
152
- def get_cli_options() -> list[click.Option]:
153
- return []
154
-
155
-
156
- databricks_volumes_dest_cmd = DestCmd(
157
- cmd_name=CONNECTOR_TYPE,
158
- connection_config=DatabricksVolumesCliConnectionConfig,
159
- uploader_config=DatabricksVolumesCliUploaderConfig,
160
- upload_stager_config=DatabricksVolumesCliUploadStagerConfig,
161
- )
@@ -1,159 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
- from unstructured_ingest.v2.processes.connectors.elasticsearch import CONNECTOR_TYPE
9
-
10
-
11
- @dataclass
12
- class ElasticsearchCliConnectionConfig(CliConfig):
13
- @staticmethod
14
- def get_cli_options() -> list[click.Option]:
15
- options = [
16
- click.Option(
17
- ["--hosts"],
18
- type=DelimitedString(),
19
- help='List of the Elasticsearch hosts to connect to, e.g. "http://localhost:9200"',
20
- ),
21
- click.Option(
22
- ["--username"], type=str, default=None, help="username when using basic auth"
23
- ),
24
- click.Option(
25
- ["--password"],
26
- type=str,
27
- default=None,
28
- help="password when using basic auth or connecting to a cloud instance",
29
- ),
30
- click.Option(
31
- ["--cloud-id"], type=str, default=None, help="id used to connect to Elastic Cloud"
32
- ),
33
- click.Option(
34
- ["--es-api-key"], type=str, default=None, help="api key used for authentication"
35
- ),
36
- click.Option(
37
- ["--api-key-id"],
38
- type=str,
39
- default=None,
40
- help="id associated with api key used for authentication: "
41
- "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html", # noqa: E501
42
- # noqa: E501
43
- ),
44
- click.Option(
45
- ["--bearer-auth"],
46
- type=str,
47
- default=None,
48
- help="bearer token used for HTTP bearer authentication",
49
- ),
50
- click.Option(
51
- ["--ca-certs"],
52
- type=click.Path(),
53
- default=None,
54
- ),
55
- click.Option(
56
- ["--ssl-assert-fingerprint"],
57
- type=str,
58
- default=None,
59
- help="SHA256 fingerprint value",
60
- ),
61
- ]
62
- return options
63
-
64
-
65
- @dataclass
66
- class ElasticsearchCliDownloadConfig(CliConfig):
67
- @staticmethod
68
- def get_cli_options() -> list[click.Option]:
69
- options = [
70
- click.Option(
71
- ["--download-dir"],
72
- help="Where files are downloaded to, defaults to a location at"
73
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
74
- ),
75
- click.Option(
76
- ["--fields"],
77
- type=DelimitedString(),
78
- default=[],
79
- help="If provided, will limit the fields returned by Elasticsearch "
80
- "to this comma-delimited list",
81
- ),
82
- ]
83
- return options
84
-
85
-
86
- @dataclass
87
- class ElasticsearchCliIndexerConfig(CliConfig):
88
- @staticmethod
89
- def get_cli_options() -> list[click.Option]:
90
- options = [
91
- click.Option(
92
- ["--index-name"],
93
- required=True,
94
- type=str,
95
- help="Name of the Elasticsearch index to pull data from, or upload data to.",
96
- ),
97
- click.Option(
98
- ["--batch-size"],
99
- default=100,
100
- type=click.IntRange(0),
101
- help="how many records to read at a time per process",
102
- ),
103
- ]
104
- return options
105
-
106
-
107
- @dataclass
108
- class ElasticsearchCliUploadStagerConfig(CliConfig):
109
- @staticmethod
110
- def get_cli_options() -> list[click.Option]:
111
- options = [
112
- click.Option(
113
- ["--index-name"],
114
- required=True,
115
- type=str,
116
- help="Name of the Elasticsearch index to pull data from, or upload data to.",
117
- ),
118
- ]
119
- return options
120
-
121
-
122
- @dataclass
123
- class ElasticsearchUploaderConfig(CliConfig):
124
- @staticmethod
125
- def get_cli_options() -> list[click.Option]:
126
- options = [
127
- click.Option(
128
- ["--batch-size-bytes"],
129
- required=False,
130
- default=15_000_000,
131
- type=int,
132
- help="Size limit (in bytes) for each batch of items to be uploaded. Check"
133
- " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html"
134
- "#_how_big_is_too_big for more information.",
135
- ),
136
- click.Option(
137
- ["--num-threads"],
138
- required=False,
139
- default=1,
140
- type=int,
141
- help="Number of threads to be used while uploading content",
142
- ),
143
- ]
144
- return options
145
-
146
-
147
- elasticsearch_src_cmd = SrcCmd(
148
- cmd_name=CONNECTOR_TYPE,
149
- connection_config=ElasticsearchCliConnectionConfig,
150
- indexer_config=ElasticsearchCliIndexerConfig,
151
- downloader_config=ElasticsearchCliDownloadConfig,
152
- )
153
-
154
- elasticsearch_dest_cmd = DestCmd(
155
- cmd_name=CONNECTOR_TYPE,
156
- connection_config=ElasticsearchCliConnectionConfig,
157
- upload_stager_config=ElasticsearchCliUploadStagerConfig,
158
- uploader_config=ElasticsearchUploaderConfig,
159
- )
@@ -1,84 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.processes.connectors.fsspec.azure import (
13
- CONNECTOR_TYPE,
14
- )
15
-
16
-
17
- @dataclass
18
- class AzureCliDownloadConfig(FsspecCliDownloadConfig):
19
- pass
20
-
21
-
22
- @dataclass
23
- class AzureCliIndexerConfig(FsspecCliIndexerConfig):
24
- pass
25
-
26
-
27
- @dataclass
28
- class AzureCliConnectionConfig(CliConfig):
29
- @staticmethod
30
- def get_cli_options() -> list[click.Option]:
31
- options = [
32
- click.Option(
33
- ["--account-key"],
34
- default=None,
35
- help="The storage account key. This is used for shared key "
36
- "authentication. If any of account key, sas token or "
37
- "client_id are not specified, anonymous access will be used.",
38
- ),
39
- click.Option(
40
- ["--account-name"],
41
- default=None,
42
- help="The storage account name. This is used to authenticate "
43
- "requests signed with an account key and to construct "
44
- "the storage endpoint. It is required unless a connection "
45
- "string is given, or if a custom domain is used with "
46
- "anonymous authentication.",
47
- ),
48
- click.Option(
49
- ["--connection-string"],
50
- default=None,
51
- help="If specified, this will override all other parameters. See "
52
- "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501
53
- "for the connection string format.",
54
- ),
55
- click.Option(
56
- ["--sas_token"],
57
- default=None,
58
- help="A shared access signature token to use to authenticate "
59
- "requests instead of the account key. If account key and "
60
- "sas token are both specified, account key will be used "
61
- "to sign. If any of account key, sas token or client_id "
62
- "are not specified, anonymous access will be used.",
63
- ),
64
- ]
65
- return options
66
-
67
-
68
- @dataclass
69
- class AzureUploaderConfig(FsspecCliUploaderConfig):
70
- pass
71
-
72
-
73
- azure_src_cmd = SrcCmd(
74
- cmd_name=CONNECTOR_TYPE,
75
- indexer_config=AzureCliIndexerConfig,
76
- connection_config=AzureCliConnectionConfig,
77
- downloader_config=AzureCliDownloadConfig,
78
- )
79
-
80
- azure_dest_cmd = DestCmd(
81
- cmd_name=CONNECTOR_TYPE,
82
- connection_config=AzureCliConnectionConfig,
83
- uploader_config=AzureUploaderConfig,
84
- )
@@ -1,58 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.processes.connectors.fsspec.box import (
13
- CONNECTOR_TYPE,
14
- )
15
-
16
-
17
- @dataclass
18
- class BoxCliDownloadConfig(FsspecCliDownloadConfig):
19
- pass
20
-
21
-
22
- @dataclass
23
- class BoxCliIndexerConfig(FsspecCliIndexerConfig):
24
- pass
25
-
26
-
27
- @dataclass
28
- class BoxCliConnectionConfig(CliConfig):
29
- @staticmethod
30
- def get_cli_options() -> list[click.Option]:
31
- options = [
32
- click.Option(
33
- ["--box-app-config"],
34
- default=None,
35
- type=click.Path(),
36
- help="Path to Box app credentials as json file.",
37
- ),
38
- ]
39
- return options
40
-
41
-
42
- @dataclass
43
- class BoxUploaderConfig(FsspecCliUploaderConfig):
44
- pass
45
-
46
-
47
- box_src_cmd = SrcCmd(
48
- cmd_name=CONNECTOR_TYPE,
49
- indexer_config=BoxCliIndexerConfig,
50
- connection_config=BoxCliConnectionConfig,
51
- downloader_config=BoxCliDownloadConfig,
52
- )
53
-
54
- box_dest_cmd = DestCmd(
55
- cmd_name=CONNECTOR_TYPE,
56
- connection_config=BoxCliConnectionConfig,
57
- uploader_config=BoxUploaderConfig,
58
- )
@@ -1,58 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
6
- from unstructured_ingest.v2.cli.cmds.fsspec.fsspec import (
7
- FsspecCliDownloadConfig,
8
- FsspecCliIndexerConfig,
9
- FsspecCliUploaderConfig,
10
- )
11
- from unstructured_ingest.v2.cli.interfaces import CliConfig
12
- from unstructured_ingest.v2.processes.connectors.fsspec.dropbox import (
13
- CONNECTOR_TYPE,
14
- )
15
-
16
-
17
- @dataclass
18
- class DropboxCliDownloadConfig(FsspecCliDownloadConfig):
19
- pass
20
-
21
-
22
- @dataclass
23
- class DropboxCliIndexerConfig(FsspecCliIndexerConfig):
24
- pass
25
-
26
-
27
- @dataclass
28
- class DropboxCliConnectionConfig(CliConfig):
29
- @staticmethod
30
- def get_cli_options() -> list[click.Option]:
31
- options = [
32
- click.Option(
33
- ["--token"],
34
- required=True,
35
- type=str,
36
- help="Dropbox access token.",
37
- ),
38
- ]
39
- return options
40
-
41
-
42
- @dataclass
43
- class DropboxUploaderConfig(FsspecCliUploaderConfig):
44
- pass
45
-
46
-
47
- dropbox_src_cmd = SrcCmd(
48
- cmd_name=CONNECTOR_TYPE,
49
- indexer_config=DropboxCliIndexerConfig,
50
- connection_config=DropboxCliConnectionConfig,
51
- downloader_config=DropboxCliDownloadConfig,
52
- )
53
-
54
- dropbox_dest_cmd = DestCmd(
55
- cmd_name=CONNECTOR_TYPE,
56
- connection_config=DropboxCliConnectionConfig,
57
- uploader_config=DropboxUploaderConfig,
58
- )
@@ -1,69 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.interfaces import CliConfig
6
-
7
-
8
- @dataclass
9
- class FsspecCliDownloadConfig(CliConfig):
10
- @staticmethod
11
- def get_cli_options() -> list[click.Option]:
12
- return [
13
- click.Option(
14
- ["--download-dir"],
15
- help="Where files are downloaded to, defaults to a location at"
16
- "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
17
- ),
18
- ]
19
-
20
-
21
- @dataclass
22
- class FsspecCliFileConfig(CliConfig):
23
- @staticmethod
24
- def get_cli_options() -> list[click.Option]:
25
- return [
26
- click.Option(
27
- ["--remote-url"],
28
- required=True,
29
- help="Remote fsspec URL formatted as `protocol://dir/path`",
30
- )
31
- ]
32
-
33
-
34
- @dataclass
35
- class FsspecCliUploaderConfig(FsspecCliFileConfig):
36
- @staticmethod
37
- def get_cli_options() -> list[click.Option]:
38
- options = super(FsspecCliUploaderConfig, FsspecCliUploaderConfig).get_cli_options()
39
- options.extend(
40
- [
41
- click.Option(
42
- ["--overwrite"],
43
- is_flag=True,
44
- default=False,
45
- show_default=True,
46
- help="If set, will overwrite content if content already exists",
47
- )
48
- ]
49
- )
50
- return options
51
-
52
-
53
- @dataclass
54
- class FsspecCliIndexerConfig(FsspecCliFileConfig):
55
- @staticmethod
56
- def get_cli_options() -> list[click.Option]:
57
- options = super(FsspecCliIndexerConfig, FsspecCliIndexerConfig).get_cli_options()
58
- options.extend(
59
- [
60
- click.Option(
61
- ["--recursive"],
62
- is_flag=True,
63
- default=False,
64
- help="Recursively download files in their respective folders "
65
- "otherwise stop at the files in provided folder level.",
66
- ),
67
- ]
68
- )
69
- return options