unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
@@ -1,62 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.pinecone import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class PineconeCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--api-key"],
17
- required=True,
18
- type=str,
19
- help="API key for Pinecone.",
20
- ),
21
- click.Option(
22
- ["--index-name"],
23
- required=True,
24
- type=str,
25
- help="Name of the index to connect to. Example: my-index",
26
- ),
27
- click.Option(
28
- ["--environment"],
29
- required=True,
30
- type=str,
31
- help="Environment to connect to. Example: us-east-1",
32
- ),
33
- ]
34
- return options
35
-
36
-
37
- @dataclass
38
- class PineconeCliUploaderConfig(CliConfig):
39
- @staticmethod
40
- def get_cli_options() -> list[click.Option]:
41
- options = [
42
- click.Option(
43
- ["--batch-size"],
44
- default=100,
45
- type=int,
46
- help="Number of records per batch",
47
- ),
48
- click.Option(
49
- ["--num-processes"],
50
- default=4,
51
- type=int,
52
- help="Number of processes to use for uploading",
53
- ),
54
- ]
55
- return options
56
-
57
-
58
- pinecone_dest_cmd = DestCmd(
59
- cmd_name=CONNECTOR_TYPE,
60
- connection_config=PineconeCliConnectionConfig,
61
- uploader_config=PineconeCliUploaderConfig,
62
- )
@@ -1,79 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
- from unstructured_ingest.v2.processes.connectors.salesforce import (
9
- ACCEPTED_CATEGORIES,
10
- CONNECTOR_TYPE,
11
- )
12
-
13
-
14
- @dataclass
15
- class SalesforceCliConnectionConfig(CliConfig):
16
- @staticmethod
17
- def get_cli_options() -> list[click.Option]:
18
- options = [
19
- click.Option(
20
- ["--username"],
21
- required=True,
22
- type=str,
23
- help="Salesforce username usually looks like an email.",
24
- ),
25
- click.Option(
26
- ["--consumer-key"],
27
- required=True,
28
- type=str,
29
- help="For the Salesforce JWT auth. Found in Consumer Details.",
30
- ),
31
- click.Option(
32
- ["--private-key"],
33
- required=True,
34
- type=str,
35
- help="Path to the private key or its contents for the Salesforce JWT auth. "
36
- "Key file is usually named server.key.",
37
- ),
38
- ]
39
- return options
40
-
41
-
42
- @dataclass
43
- class SalesforceCliIndexerConfig(CliConfig):
44
- @staticmethod
45
- def get_cli_options() -> list[click.Option]:
46
- possible_categories = ACCEPTED_CATEGORIES
47
- options = [
48
- click.Option(
49
- ["--categories"],
50
- default=None,
51
- required=True,
52
- type=DelimitedString(choices=possible_categories),
53
- help="Comma-delimited salesforce categories to download. "
54
- "Currently only {}.".format(", ".join(possible_categories)),
55
- ),
56
- ]
57
- return options
58
-
59
-
60
- @dataclass
61
- class SalesforceCliDownloadConfig(CliConfig):
62
- @staticmethod
63
- def get_cli_options() -> list[click.Option]:
64
- options = [
65
- click.Option(
66
- ["--download-dir"],
67
- help="Where files are downloaded to, defaults to a location at"
68
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
69
- ),
70
- ]
71
- return options
72
-
73
-
74
- salesforce_src_cmd = SrcCmd(
75
- cmd_name=CONNECTOR_TYPE,
76
- connection_config=SalesforceCliConnectionConfig,
77
- indexer_config=SalesforceCliIndexerConfig,
78
- downloader_config=SalesforceCliDownloadConfig,
79
- )
@@ -1,112 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import SrcCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.sharepoint import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class SharepointCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--client-id"],
17
- default=None,
18
- type=str,
19
- help="Sharepoint app client ID",
20
- ),
21
- click.Option(
22
- ["--client-cred"],
23
- default=None,
24
- type=str,
25
- help="Sharepoint app secret",
26
- ),
27
- click.Option(
28
- ["--site"],
29
- default=None,
30
- type=str,
31
- help="Sharepoint site url. Process either base url e.g \
32
- https://[tenant].sharepoint.com or relative sites \
33
- https://[tenant].sharepoint.com/sites/<site_name>. \
34
- To process all sites within the tenant pass a site url as \
35
- https://[tenant]-admin.sharepoint.com.\
36
- This requires the app to be registered at a tenant level",
37
- ),
38
- click.Option(
39
- ["--permissions-application-id"],
40
- type=str,
41
- help="Microsoft Graph API application id",
42
- ),
43
- click.Option(
44
- ["--permissions-client-cred"],
45
- type=str,
46
- help="Microsoft Graph API application credentials",
47
- ),
48
- click.Option(
49
- ["--permissions-tenant"],
50
- type=str,
51
- help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.",
52
- ),
53
- ]
54
- return options
55
-
56
-
57
- @dataclass
58
- class SharepointCliIndexerConfig(CliConfig):
59
- @staticmethod
60
- def get_cli_options() -> list[click.Option]:
61
- options = [
62
- click.Option(
63
- ["--path"],
64
- default=None,
65
- type=str,
66
- help="Path from which to start parsing files. If the connector is to \
67
- process all sites within the tenant this filter will be applied to \
68
- all sites document libraries.",
69
- ),
70
- click.Option(
71
- ["--recursive"],
72
- is_flag=True,
73
- default=False,
74
- help="Recursively download files in their respective folders "
75
- "otherwise stop at the files in provided folder level.",
76
- ),
77
- click.Option(
78
- ["--omit-files"],
79
- is_flag=True,
80
- default=False,
81
- help="Don't process files.",
82
- ),
83
- click.Option(
84
- ["--omit-pages"],
85
- is_flag=True,
86
- default=False,
87
- help="Don't process site pages.",
88
- ),
89
- ]
90
- return options
91
-
92
-
93
- @dataclass
94
- class SharepointCliDownloadConfig(CliConfig):
95
- @staticmethod
96
- def get_cli_options() -> list[click.Option]:
97
- options = [
98
- click.Option(
99
- ["--download-dir"],
100
- help="Where files are downloaded to, defaults to a location at"
101
- "`$HOME/.cache/unstructured_ingest/<connector name>/<SHA256>`.",
102
- ),
103
- ]
104
- return options
105
-
106
-
107
- sharepoint_drive_src_cmd = SrcCmd(
108
- cmd_name=CONNECTOR_TYPE,
109
- connection_config=SharepointCliConnectionConfig,
110
- indexer_config=SharepointCliIndexerConfig,
111
- downloader_config=SharepointCliDownloadConfig,
112
- )
@@ -1,96 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.singlestore import CONNECTOR_TYPE
8
-
9
-
10
- @dataclass
11
- class SingleStoreCliConnectionConfig(CliConfig):
12
- @staticmethod
13
- def get_cli_options() -> list[click.Option]:
14
- options = [
15
- click.Option(
16
- ["--host"],
17
- required=False,
18
- type=str,
19
- default=None,
20
- help="SingleStore host",
21
- ),
22
- click.Option(
23
- ["--port"],
24
- required=False,
25
- type=int,
26
- default=None,
27
- help="SingleStore port",
28
- ),
29
- click.Option(
30
- ["--user"],
31
- required=False,
32
- type=str,
33
- default=None,
34
- help="SingleStore user",
35
- ),
36
- click.Option(
37
- ["--password"],
38
- required=False,
39
- type=str,
40
- default=None,
41
- help="SingleStore password",
42
- ),
43
- click.Option(
44
- ["--database"],
45
- required=False,
46
- type=str,
47
- default=None,
48
- help="SingleStore database",
49
- ),
50
- ]
51
- return options
52
-
53
-
54
- @dataclass
55
- class SingleStoreCliUploaderConfig(CliConfig):
56
- @staticmethod
57
- def get_cli_options() -> list[click.Option]:
58
- options = [
59
- click.Option(
60
- ["--drop-empty-cols"],
61
- required=False,
62
- type=bool,
63
- is_flag=True,
64
- default=False,
65
- help="Drop any columns that have no data",
66
- ),
67
- ]
68
- return options
69
-
70
-
71
- @dataclass
72
- class SingleStoreCliUploadStagerConfig(CliConfig):
73
- @staticmethod
74
- def get_cli_options() -> list[click.Option]:
75
- return [
76
- click.Option(
77
- ["--table-name"],
78
- required=False,
79
- type=str,
80
- help="SingleStore table to write contents to",
81
- ),
82
- click.Option(
83
- ["--batch-size"],
84
- required=False,
85
- type=click.IntRange(min=1),
86
- help="Batch size when writing to SingleStore",
87
- ),
88
- ]
89
-
90
-
91
- singlestore_dest_cmd = DestCmd(
92
- cmd_name=CONNECTOR_TYPE,
93
- connection_config=SingleStoreCliConnectionConfig,
94
- uploader_config=SingleStoreCliUploaderConfig,
95
- upload_stager_config=SingleStoreCliUploadStagerConfig,
96
- )
@@ -1,84 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.processes.connectors.sql import CONNECTOR_TYPE
8
-
9
- SQL_DRIVERS = {"postgresql", "sqlite"}
10
-
11
-
12
- @dataclass
13
- class SQLCliConnectionConfig(CliConfig):
14
- @staticmethod
15
- def get_cli_options() -> list[click.Option]:
16
- options = [
17
- click.Option(
18
- ["--db-type"],
19
- required=True,
20
- type=click.Choice(SQL_DRIVERS),
21
- help="Type of the database backend",
22
- ),
23
- click.Option(
24
- ["--username"],
25
- default=None,
26
- type=str,
27
- help="DB username",
28
- ),
29
- click.Option(
30
- ["--password"],
31
- default=None,
32
- type=str,
33
- help="DB password",
34
- ),
35
- click.Option(
36
- ["--host"],
37
- default=None,
38
- type=str,
39
- help="DB host",
40
- ),
41
- click.Option(
42
- ["--port"],
43
- default=None,
44
- type=int,
45
- help="DB host connection port",
46
- ),
47
- click.Option(
48
- ["--database"],
49
- default=None,
50
- type=str,
51
- help="Database name. For sqlite databases, this is the path to the .db file.",
52
- ),
53
- ]
54
- return options
55
-
56
-
57
- @dataclass
58
- class SQLCliUploaderConfig(CliConfig):
59
- @staticmethod
60
- def get_cli_options() -> list[click.Option]:
61
- options = [
62
- click.Option(
63
- ["--batch-size"],
64
- default=100,
65
- type=int,
66
- help="Number of records per batch",
67
- )
68
- ]
69
- return options
70
-
71
-
72
- @dataclass
73
- class SQLCliUploadStagerConfig(CliConfig):
74
- @staticmethod
75
- def get_cli_options() -> list[click.Option]:
76
- return []
77
-
78
-
79
- sql_dest_cmd = DestCmd(
80
- cmd_name=CONNECTOR_TYPE,
81
- connection_config=SQLCliConnectionConfig,
82
- uploader_config=SQLCliUploaderConfig,
83
- upload_stager_config=SQLCliUploadStagerConfig,
84
- )
@@ -1,100 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
-
5
- from unstructured_ingest.v2.cli.base import DestCmd
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
- from unstructured_ingest.v2.cli.utils import DelimitedString
8
- from unstructured_ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE
9
-
10
-
11
- @dataclass
12
- class WeaviateCliConnectionConfig(CliConfig):
13
- @staticmethod
14
- def get_cli_options() -> list[click.Option]:
15
- options = [
16
- click.Option(
17
- ["--host-url"],
18
- required=True,
19
- help="Weaviate instance url",
20
- ),
21
- click.Option(
22
- ["--class-name"],
23
- default=None,
24
- type=str,
25
- help="Name of the class to push the records into, e.g: Pdf-elements",
26
- ),
27
- click.Option(
28
- ["--access-token"], default=None, type=str, help="Used to create the bearer token."
29
- ),
30
- click.Option(
31
- ["--refresh-token"],
32
- default=None,
33
- type=str,
34
- help="Will tie this value to the bearer token. If not provided, "
35
- "the authentication will expire once the lifetime of the access token is up.",
36
- ),
37
- click.Option(
38
- ["--api-key"],
39
- default=None,
40
- type=str,
41
- ),
42
- click.Option(
43
- ["--client-secret"],
44
- default=None,
45
- type=str,
46
- ),
47
- click.Option(
48
- ["--scope"],
49
- default=None,
50
- type=DelimitedString(),
51
- ),
52
- click.Option(
53
- ["--username"],
54
- default=None,
55
- type=str,
56
- ),
57
- click.Option(
58
- ["--password"],
59
- default=None,
60
- type=str,
61
- ),
62
- click.Option(
63
- ["--anonymous"],
64
- is_flag=True,
65
- default=False,
66
- type=bool,
67
- help="if set, all auth values will be ignored",
68
- ),
69
- ]
70
- return options
71
-
72
-
73
- @dataclass
74
- class WeaviateCliUploaderConfig(CliConfig):
75
- @staticmethod
76
- def get_cli_options() -> list[click.Option]:
77
- options = [
78
- click.Option(
79
- ["--batch-size"],
80
- default=100,
81
- type=int,
82
- help="Number of records per batch",
83
- )
84
- ]
85
- return options
86
-
87
-
88
- @dataclass
89
- class WeaviateCliUploadStagerConfig(CliConfig):
90
- @staticmethod
91
- def get_cli_options() -> list[click.Option]:
92
- return []
93
-
94
-
95
- weaviate_dest_cmd = DestCmd(
96
- cmd_name=CONNECTOR_TYPE,
97
- connection_config=WeaviateCliConnectionConfig,
98
- uploader_config=WeaviateCliUploaderConfig,
99
- upload_stager_config=WeaviateCliUploadStagerConfig,
100
- )
@@ -1,13 +0,0 @@
1
- from .chunk import ChunkerCliConfig
2
- from .embed import EmbedderCliConfig
3
- from .filter import FilterCliConfig
4
- from .partition import PartitionerCliConfig
5
- from .processor import ProcessorCliConfig
6
-
7
- __all__ = [
8
- "ChunkerCliConfig",
9
- "ProcessorCliConfig",
10
- "PartitionerCliConfig",
11
- "EmbedderCliConfig",
12
- "FilterCliConfig",
13
- ]
@@ -1,89 +0,0 @@
1
- from dataclasses import dataclass
2
-
3
- import click
4
- from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT
5
-
6
- from unstructured_ingest.v2.cli.interfaces import CliConfig
7
-
8
-
9
- @dataclass
10
- class ChunkerCliConfig(CliConfig):
11
- @staticmethod
12
- def get_cli_options() -> list[click.Option]:
13
- options = [
14
- click.Option(
15
- ["--chunking-strategy"],
16
- type=str,
17
- default=None,
18
- help="The rule-set to use to form chunks. Omit to disable chunking.",
19
- ),
20
- click.Option(
21
- ["--chunk-combine-text-under-n-chars"],
22
- type=int,
23
- help=(
24
- "Combine consecutive chunks when the first does not exceed this length and"
25
- " the second will fit without exceeding the hard-maximum length. Only"
26
- " operative for 'by_title' chunking-strategy."
27
- ),
28
- ),
29
- click.Option(
30
- ["--chunk-include-orig-elements/--chunk-no-include-orig-elements"],
31
- is_flag=True,
32
- default=True,
33
- help=(
34
- "When chunking, add the original elements consolidated to form each chunk to"
35
- " `.metadata.orig_elements` on that chunk."
36
- ),
37
- ),
38
- click.Option(
39
- ["--chunk-max-characters"],
40
- type=int,
41
- default=CHUNK_MAX_CHARS_DEFAULT,
42
- show_default=True,
43
- help=(
44
- "Hard maximum chunk length. No chunk will exceed this length. An oversized"
45
- " element will be divided by text-splitting to fit this window."
46
- ),
47
- ),
48
- click.Option(
49
- ["--chunk-multipage-sections/--chunk-no-multipage-sections"],
50
- is_flag=True,
51
- default=CHUNK_MULTI_PAGE_DEFAULT,
52
- help=(
53
- "Ignore page boundaries when chunking such that elements from two different"
54
- " pages can appear in the same chunk. Only operative for 'by_title'"
55
- " chunking-strategy."
56
- ),
57
- ),
58
- click.Option(
59
- ["--chunk-new-after-n-chars"],
60
- type=int,
61
- help=(
62
- "Soft-maximum chunk length. Another element will not be added to a chunk of"
63
- " this length even when it would fit without exceeding the hard-maximum"
64
- " length."
65
- ),
66
- ),
67
- click.Option(
68
- ["--chunk-overlap"],
69
- type=int,
70
- default=0,
71
- show_default=True,
72
- help=(
73
- "Prefix chunk text with last overlap=N characters of prior chunk. Only"
74
- " applies to oversized chunks divided by text-splitting. To apply overlap to"
75
- " non-oversized chunks use the --overlap-all option."
76
- ),
77
- ),
78
- click.Option(
79
- ["--chunk-overlap-all"],
80
- is_flag=True,
81
- default=False,
82
- help=(
83
- "Apply overlap to chunks formed from whole elements as well as those formed"
84
- " by text-splitting oversized elements. Overlap length is take from --overlap"
85
- " option value."
86
- ),
87
- ),
88
- ]
89
- return options