unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces import BaseProcess
|
|
10
|
+
from unstructured_ingest.utils import ndjson
|
|
11
|
+
from unstructured_ingest.utils.data_prep import get_json_data, write_data
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class UploadStagerConfig(BaseModel):
    """Base (empty) configuration for an upload stager.

    Concrete stagers subclass this to declare their own settings.
    """


UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
|
+
|
|
21
|
+
@dataclass
class UploadStager(BaseProcess, ABC):
    """Rewrites partitioned elements on disk into a destination's expected shape.

    Reads a .json or .ndjson file of element dicts, runs ``conform_dict`` over
    each element, and writes the conformed data under ``output_dir``.
    """

    upload_stager_config: UploadStagerConfigT

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Map one element dict to the destination schema; identity by default."""
        return element_dict

    def get_output_path(self, output_filename: str, output_dir: Path) -> Path:
        """Resolve where the staged output goes, creating parent dirs as needed.

        Only the final path component of ``output_filename`` is kept (stem +
        suffix == name), so a full path and a bare file name behave the same.
        """
        output_path = Path(output_dir) / Path(output_filename).name
        output_path.parent.mkdir(parents=True, exist_ok=True)
        return output_path

    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Conform an .ndjson file element-by-element without loading it whole."""
        with input_file.open() as in_f, output_file.open("w") as out_f:
            writer = ndjson.writer(out_f)
            for element in ndjson.reader(in_f):
                conformed = self.conform_dict(element_dict=element, file_data=file_data)
                writer.write(row=conformed)
                # Flush per element so partial progress is durable while streaming.
                writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
        """Conform a .json file by loading all elements into memory at once."""
        elements = get_json_data(path=input_file)
        conformed_elements = [
            self.conform_dict(element_dict=element, file_data=file_data) for element in elements
        ]
        write_data(path=output_file, data=conformed_elements)

    def run(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Stage ``elements_filepath`` into ``output_dir`` and return the new path.

        .ndjson inputs are streamed; .json inputs are processed in memory.

        Raises:
            ValueError: if the input file has an unsupported extension.
        """
        output_file = self.get_output_path(output_filename=output_filename, output_dir=output_dir)
        suffix = elements_filepath.suffix
        if suffix == ".ndjson":
            self.stream_update(
                input_file=elements_filepath, output_file=output_file, file_data=file_data
            )
        elif suffix == ".json":
            self.process_whole(
                input_file=elements_filepath, output_file=output_file, file_data=file_data
            )
        else:
            raise ValueError(f"Unsupported file extension: {elements_filepath}")
        return output_file

    async def run_async(
        self,
        elements_filepath: Path,
        file_data: FileData,
        output_dir: Path,
        output_filename: str,
        **kwargs: Any,
    ) -> Path:
        """Async wrapper; staging is disk-bound so it simply defers to ``run``."""
        return self.run(
            elements_filepath=elements_filepath,
            output_dir=output_dir,
            output_filename=output_filename,
            file_data=file_data,
            **kwargs,
        )
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.interfaces import BaseConnector, BaseProcess
|
|
10
|
+
from unstructured_ingest.utils.data_prep import get_json_data
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class UploaderConfig(BaseModel):
    """Base (empty) configuration for an uploader; subclasses add destination settings."""


UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig)
|
19
|
+
|
|
20
|
+
@dataclass
class UploadContent:
    """Pairs a local payload file with the FileData record describing its origin."""

    path: Path  # local file containing the elements to upload
    file_data: FileData  # record describing the source document
+
|
|
26
|
+
@dataclass
class Uploader(BaseProcess, BaseConnector, ABC):
    """Writes conformed element data to a destination.

    Subclasses implement ``run_data`` (and optionally ``run_data_async``); the
    ``run``/``run_async`` entry points take care of reading the payload file.
    """

    upload_config: UploaderConfigT
    connector_type: str

    def is_async(self) -> bool:
        """Whether this uploader should be driven through the async code path."""
        return False

    def is_batch(self) -> bool:
        """Whether uploads should be grouped and sent through ``run_batch``."""
        return False

    def run_batch(self, contents: list[UploadContent], **kwargs: Any) -> None:
        """Upload several payloads at once; only meaningful when ``is_batch()`` is True."""
        raise NotImplementedError()

    def create_destination(
        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        """Create the destination if needed, updating the uploader config to match.

        Returns True only when something was actually created.
        """
        return False

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Load the JSON payload at ``path`` and hand it off to ``run_data``."""
        loaded = get_json_data(path=path)
        self.run_data(data=loaded, file_data=file_data, **kwargs)

    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        """Async variant of ``run``; defers to ``run_data_async``."""
        loaded = get_json_data(path=path)
        await self.run_data_async(data=loaded, file_data=file_data, **kwargs)

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload already-loaded element dicts; concrete uploaders implement this."""
        raise NotImplementedError()

    async def run_data_async(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Async variant of ``run_data``; defaults to the synchronous implementation."""
        return self.run_data(data=data, file_data=file_data, **kwargs)
|
61
|
+
|
|
62
|
+
@dataclass
class VectorDBUploader(Uploader, ABC):
    """Uploader for vector databases, where creating a destination needs a vector length."""

    def create_destination(
        self, vector_length: int, destination_name: str = "unstructuredautocreated", **kwargs: Any
    ) -> bool:
        # NOTE(review): this override adds a required ``vector_length`` parameter ahead
        # of the base signature, so generic callers of Uploader.create_destination must
        # special-case vector DB uploaders — confirm this is intentional.
        return False
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
# Shared package-level logger; configured by ingest_log_streaming_init / make_default_logger.
logger = logging.getLogger("unstructured_ingest")
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def remove_root_handlers(logger: logging.Logger) -> None:
    """Strip every handler off the root logger.

    NOTE(robinson): in some environments such as Google Colab, there is a root
    handler that doesn't mask secrets, meaning sensitive info such as api keys
    can appear in logs. Removing these when they exist prevents that behavior.
    """
    # Iterate over a copy: removing from logger.root.handlers while iterating the
    # live list skips every other handler, leaving some attached.
    for handler in list(logger.root.handlers):
        logger.root.removeHandler(handler)
14
|
+
|
|
15
|
+
def ingest_log_streaming_init(level: int) -> None:
    """Attach the ingest stream handler to the package logger and set its level."""
    stream_handler = logging.StreamHandler()
    stream_handler.name = "ingest_log_handler"
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
    )

    # The handler is keyed by name so repeated init calls never attach it twice.
    existing_names = {h.name for h in logger.handlers}
    if "ingest_log_handler" not in existing_names:
        logger.addHandler(stream_handler)

    remove_root_handlers(logger)
    logger.setLevel(level)
28
|
+
|
|
29
|
+
def make_default_logger(level: int) -> logging.Logger:
    """Return the package logger configured with the standard ingest stream handler.

    Safe to call repeatedly: the handler is only attached once, keyed by its
    name (previously every call appended a fresh handler, duplicating each log
    line), matching the guard used by ``ingest_log_streaming_init``.
    """
    logger = logging.getLogger("unstructured_ingest")
    if "ingest_log_handler" not in [h.name for h in logger.handlers]:
        handler = logging.StreamHandler()
        handler.name = "ingest_log_handler"
        formatter = logging.Formatter(
            "%(asctime)s %(processName)-10s %(levelname)-8s %(message)s"
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logger.setLevel(level)
    remove_root_handlers(logger)
    return logger
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Callable, ClassVar, Optional, Protocol, Sequence
|
|
5
|
+
|
|
6
|
+
from opentelemetry import trace
|
|
7
|
+
from opentelemetry.context import attach, get_current
|
|
8
|
+
from opentelemetry.propagate import extract, inject
|
|
9
|
+
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
|
10
|
+
from opentelemetry.sdk.trace import ReadableSpan, Tracer, TracerProvider
|
|
11
|
+
from opentelemetry.sdk.trace.export import (
|
|
12
|
+
ConsoleSpanExporter,
|
|
13
|
+
SimpleSpanProcessor,
|
|
14
|
+
SpanExportResult,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
from unstructured_ingest.logger import logger
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AddTraceCallable(Protocol):
    """Signature of functions that register a span processor on a tracer provider."""

    def __call__(self, provider: TracerProvider) -> None: ...
+
|
|
24
|
+
|
|
25
|
+
class LogSpanExporter(ConsoleSpanExporter):
    """Console span exporter that routes each formatted span through a logging callable."""

    def __init__(self, log_out: Callable = logger.info, **kwargs):
        self.log_out = log_out
        super().__init__(**kwargs)

    def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult:
        """Format each span with the configured formatter and emit it via ``log_out``."""
        for readable_span in spans:
            formatted = self.formatter(readable_span)
            self.log_out(formatted)
        return SpanExportResult.SUCCESS
|
35
|
+
|
|
36
|
+
def get_log_out() -> Callable:
    """Build a one-arg logging callable honoring OTEL_LOG_LEVEL (defaults to DEBUG)."""
    named_levels = {
        name: getattr(logging, name)
        for name in (
            "CRITICAL",
            "FATAL",
            "ERROR",
            "WARN",
            "WARNING",
            "INFO",
            "DEBUG",
            "NOTSET",
        )
    }
    requested = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
    # Unknown level names silently fall back to DEBUG.
    resolved_level = named_levels.get(requested, logging.DEBUG)
    return lambda message: logger.log(resolved_level, message)
51
|
+
|
|
52
|
+
@dataclass
class OtelHandler:
    """Configures OpenTelemetry tracing for ingest runs.

    Finished spans are always mirrored to the log via ``log_out``; when an OTLP
    endpoint is configured (explicitly or through the standard OTEL_* env vars)
    they are also exported over OTLP/gRPC.
    """

    otel_endpoint: Optional[str] = None
    service_name: str = "unstructured-ingest"
    trace_provider: TracerProvider = field(init=False)
    # default_factory so OTEL_LOG_LEVEL is read when the handler is created,
    # not once at import time.
    log_out: Callable = field(default_factory=get_log_out)
    trace_context_key: ClassVar[str] = "_trace_context"

    def init_trace(self):
        """Install this handler's tracer provider globally. Should only be done once."""
        resource = Resource(attributes={SERVICE_NAME: self.service_name})
        trace_provider = self.init_trace_provider(resource=resource)
        trace.set_tracer_provider(trace_provider)

    @staticmethod
    def set_attributes(span, attributes_dict):
        """Copy every key/value in ``attributes_dict`` (if any) onto ``span``."""
        if attributes_dict:
            for key, value in attributes_dict.items():
                span.set_attribute(key, value)

    @staticmethod
    def inject_context() -> dict:
        """Serialize the current trace context into a dict for cross-process propagation."""
        trace_context: dict = {}
        inject(trace_context, get_current())
        return trace_context

    @staticmethod
    def attach_context(trace_context: dict) -> object:
        """Attach a previously injected trace context in this process; returns the token."""
        extracted_context = extract(trace_context)
        return attach(extracted_context)

    def get_otel_endpoint(self) -> Optional[str]:
        """Resolve the OTLP endpoint: explicit config first, then the standard env vars."""
        for candidate in (
            self.otel_endpoint,
            os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"),
            os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"),
        ):
            if candidate:
                return candidate
        return None

    def _add_console_trace_processor(self, provider: TracerProvider) -> None:
        """Log every finished span (name, duration, attributes) via ``log_out``."""

        def custom_formatter(span: ReadableSpan) -> str:
            duration = (span.end_time - span.start_time) / 1e9  # ns -> s
            s = f"{span.name} finished in {duration}s"
            if span.attributes:
                attributes_str = ", ".join([f"{k}={v}" for k, v in span.attributes.items()])
                s += f", attributes: {attributes_str}"
            return s

        tracer_exporter = LogSpanExporter(formatter=custom_formatter, log_out=self.log_out)
        processor = SimpleSpanProcessor(tracer_exporter)
        provider.add_span_processor(span_processor=processor)

    def _add_otel_trace_processor(self, provider: TracerProvider) -> None:
        """Export spans over OTLP/gRPC when an endpoint is configured; no-op otherwise."""
        otel_endpoint = self.get_otel_endpoint()
        if not otel_endpoint:
            return None
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

        logger.debug(f"adding otel exporter at {otel_endpoint}")
        # Pass the resolved endpoint explicitly: previously an explicitly configured
        # self.otel_endpoint was ignored and only the env vars took effect.
        trace_exporter = OTLPSpanExporter(endpoint=otel_endpoint)
        processor = SimpleSpanProcessor(trace_exporter)
        provider.add_span_processor(processor)

    def init_trace_provider(self, resource: Resource) -> TracerProvider:
        """Build a TracerProvider with the OTLP (optional) and console span processors."""
        trace_provider = TracerProvider(resource=resource)
        add_fns: list[AddTraceCallable] = [
            self._add_otel_trace_processor,
            self._add_console_trace_processor,
        ]
        for add_fn in add_fns:
            add_fn(provider=trace_provider)
        return trace_provider

    def get_tracer(self) -> Tracer:
        """Return a tracer named after this service."""
        return trace.get_tracer(self.service_name)
|
File without changes
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import multiprocessing as mp
|
|
6
|
+
import shutil
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Awaitable, Callable, Optional, TypeVar
|
|
12
|
+
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
from tqdm.asyncio import tqdm as tqdm_asyncio
|
|
15
|
+
|
|
16
|
+
from unstructured_ingest.interfaces import BaseProcess, ProcessorConfig, Uploader
|
|
17
|
+
from unstructured_ingest.logger import logger, make_default_logger
|
|
18
|
+
from unstructured_ingest.otel import OtelHandler
|
|
19
|
+
from unstructured_ingest.pipeline.otel import instrument
|
|
20
|
+
|
|
21
|
+
# Any concrete process (partitioner, chunker, uploader, ...) that a step can wrap.
BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
# Shape of a step's work queue: one kwargs dict per invocation of run/run_async.
iterable_input = list[dict[str, Any]]
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class PipelineStep(ABC):
    """One stage of the ingest pipeline, wrapping a process and the shared run context.

    Provides serial, async, and multiprocess drivers for fanning the wrapped
    process out over a list of per-invocation kwargs dicts.
    """

    process: BaseProcessT  # the wrapped process this step executes
    context: ProcessorConfig  # shared processor configuration/state
    identifier: str  # step name used in logs and progress bars
|
|
31
|
+
    def __str__(self):
        """A step's string representation is its identifier."""
        return self.identifier
33
|
+
|
|
34
|
+
def process_serially(self, iterable: iterable_input) -> Any:
|
|
35
|
+
logger.info("processing content serially")
|
|
36
|
+
if iterable:
|
|
37
|
+
if len(iterable) == 1:
|
|
38
|
+
return [self.run(**iterable[0])]
|
|
39
|
+
if self.context.tqdm:
|
|
40
|
+
return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)]
|
|
41
|
+
return [self.run(**it) for it in iterable]
|
|
42
|
+
return [self.run()]
|
|
43
|
+
|
|
44
|
+
async def _process_async(self, iterable: iterable_input) -> Any:
|
|
45
|
+
if iterable:
|
|
46
|
+
if len(iterable) == 1:
|
|
47
|
+
return [await self.run_async(**iterable[0])]
|
|
48
|
+
if self.context.tqdm:
|
|
49
|
+
return await tqdm_asyncio.gather(
|
|
50
|
+
*[self.run_async(**i) for i in iterable], desc=self.identifier
|
|
51
|
+
)
|
|
52
|
+
return await asyncio.gather(*[self.run_async(**i) for i in iterable])
|
|
53
|
+
return [await self.run_async()]
|
|
54
|
+
|
|
55
|
+
def process_async(self, iterable: iterable_input) -> Any:
|
|
56
|
+
logger.info("processing content async")
|
|
57
|
+
return self.asyncio_run(fn=self._process_async, iterable=iterable)
|
|
58
|
+
|
|
59
|
+
def asyncio_run(
|
|
60
|
+
self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any
|
|
61
|
+
) -> Any:
|
|
62
|
+
current_loop = asyncio._get_running_loop()
|
|
63
|
+
if current_loop is None:
|
|
64
|
+
return asyncio.run(fn(*args, **kwargs))
|
|
65
|
+
with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool:
|
|
66
|
+
logger.warning(
|
|
67
|
+
f"async code being run in dedicated thread pool "
|
|
68
|
+
f"to not conflict with existing event loop: {current_loop}"
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
def wrapped():
|
|
72
|
+
return asyncio.run(fn(*args, **kwargs))
|
|
73
|
+
|
|
74
|
+
future = thread_pool.submit(wrapped)
|
|
75
|
+
return future.result()
|
|
76
|
+
|
|
77
|
+
def process_multiprocess(self, iterable: iterable_input) -> Any:
|
|
78
|
+
logger.info("processing content across processes")
|
|
79
|
+
|
|
80
|
+
if iterable:
|
|
81
|
+
if len(iterable) == 1:
|
|
82
|
+
return self.process_serially(iterable)
|
|
83
|
+
if self.context.num_processes == 1:
|
|
84
|
+
return self.process_serially(iterable)
|
|
85
|
+
with mp.Pool(
|
|
86
|
+
processes=self.context.num_processes,
|
|
87
|
+
initializer=self._init_mp,
|
|
88
|
+
initargs=(
|
|
89
|
+
logging.DEBUG if self.context.verbose else logging.INFO,
|
|
90
|
+
self.context.otel_endpoint,
|
|
91
|
+
),
|
|
92
|
+
) as pool:
|
|
93
|
+
otel_context = OtelHandler.inject_context()
|
|
94
|
+
for iter in iterable:
|
|
95
|
+
iter[OtelHandler.trace_context_key] = otel_context
|
|
96
|
+
if self.context.tqdm:
|
|
97
|
+
return list(
|
|
98
|
+
tqdm(
|
|
99
|
+
pool.imap_unordered(func=self._wrap_mp, iterable=iterable),
|
|
100
|
+
total=len(iterable),
|
|
101
|
+
desc=self.identifier,
|
|
102
|
+
)
|
|
103
|
+
)
|
|
104
|
+
return pool.map(self._wrap_mp, iterable)
|
|
105
|
+
return [self.run()]
|
|
106
|
+
|
|
107
|
+
def _wrap_mp(self, input_kwargs: dict) -> Any:
|
|
108
|
+
# Allow mapping of kwargs via multiprocessing map()
|
|
109
|
+
return self.run(**input_kwargs)
|
|
110
|
+
|
|
111
|
+
def _init_mp(self, log_level: int, endpoint: Optional[str] = None) -> None:
|
|
112
|
+
# Init logger for each spawned process when using multiprocessing pool
|
|
113
|
+
make_default_logger(level=log_level)
|
|
114
|
+
otel_handler = OtelHandler(otel_endpoint=endpoint, log_out=logger.debug)
|
|
115
|
+
otel_handler.init_trace()
|
|
116
|
+
|
|
117
|
+
@instrument()
|
|
118
|
+
def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
|
|
119
|
+
iterable = iterable or []
|
|
120
|
+
if iterable:
|
|
121
|
+
logger.info(
|
|
122
|
+
f"calling {self.__class__.__name__} with {len(iterable)} docs", # type: ignore
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
logger.info(f"calling {self.__class__.__name__} with no inputs")
|
|
126
|
+
if self.context.async_supported and self.process.is_async():
|
|
127
|
+
return self.process_async(iterable=iterable)
|
|
128
|
+
if self.context.mp_supported:
|
|
129
|
+
return self.process_multiprocess(iterable=iterable)
|
|
130
|
+
return self.process_serially(iterable=iterable)
|
|
131
|
+
|
|
132
|
+
def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
|
|
133
|
+
return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs)
|
|
134
|
+
|
|
135
|
+
async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
|
|
136
|
+
raise NotImplementedError
|
|
137
|
+
|
|
138
|
+
def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
|
|
139
|
+
kwargs = kwargs.copy()
|
|
140
|
+
otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
|
|
141
|
+
tracer = otel_handler.get_tracer()
|
|
142
|
+
if trace_context := kwargs.pop(otel_handler.trace_context_key, {}):
|
|
143
|
+
otel_handler.attach_context(trace_context=trace_context)
|
|
144
|
+
attributes = {}
|
|
145
|
+
if file_data_path := kwargs.get("file_data_path"):
|
|
146
|
+
attributes["file_id"] = Path(file_data_path).stem
|
|
147
|
+
try:
|
|
148
|
+
with tracer.start_as_current_span(self.identifier, record_exception=True) as span:
|
|
149
|
+
otel_handler.set_attributes(span, attributes)
|
|
150
|
+
fn = _fn or self.process.run
|
|
151
|
+
return self._run(fn=fn, **kwargs)
|
|
152
|
+
except Exception as e:
|
|
153
|
+
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
154
|
+
if "file_data_path" in kwargs:
|
|
155
|
+
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
156
|
+
if self.context.raise_on_error:
|
|
157
|
+
raise e
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
|
|
161
|
+
otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
|
|
162
|
+
try:
|
|
163
|
+
attributes = {}
|
|
164
|
+
if file_data_path := kwargs.get("file_data_path"):
|
|
165
|
+
attributes["file_id"] = Path(file_data_path).stem
|
|
166
|
+
with otel_handler.get_tracer().start_as_current_span(
|
|
167
|
+
self.identifier, record_exception=True
|
|
168
|
+
) as span:
|
|
169
|
+
otel_handler.set_attributes(span, attributes)
|
|
170
|
+
fn = _fn or self.process.run_async
|
|
171
|
+
return await self._run_async(fn=fn, **kwargs)
|
|
172
|
+
except Exception as e:
|
|
173
|
+
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
174
|
+
if "file_data_path" in kwargs:
|
|
175
|
+
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
176
|
+
if self.context.raise_on_error:
|
|
177
|
+
raise e
|
|
178
|
+
return None
|
|
179
|
+
|
|
180
|
+
@property
|
|
181
|
+
def cache_dir(self) -> Path:
|
|
182
|
+
return Path(self.context.work_dir) / self.identifier
|
|
183
|
+
|
|
184
|
+
def delete_cache(self):
|
|
185
|
+
if self.context.iter_delete and self.cache_dir.exists():
|
|
186
|
+
cache_dir = self.cache_dir
|
|
187
|
+
logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
|
|
188
|
+
shutil.rmtree(cache_dir)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@dataclass
class BatchPipelineStep(PipelineStep, ABC):
    """Pipeline step whose process (an Uploader) may handle items as one batch."""

    process: Uploader

    def __call__(self, iterable: Optional[iterable_input] = None) -> Any:
        # NOTE(review): batch mode is gated on mp_supported — presumably batching
        # replaces per-item multiprocessing fan-out; confirm against
        # ProcessorConfig semantics.
        if self.context.mp_supported and self.process.is_batch():
            return self.run_batch(contents=iterable)
        # Fix: propagate the base-class result instead of discarding it, so the
        # non-batch path returns step output just like the batch path does.
        return super().__call__(iterable=iterable)

    @abstractmethod
    def _run_batch(self, contents: iterable_input, **kwargs) -> Any:
        # Subclass hook: process the entire batch of work items at once.
        pass

    def run_batch(self, contents: iterable_input, **kwargs) -> Any:
        """Run the batch hook, recording failures against the step itself."""
        try:
            return self._run_batch(contents=contents, **kwargs)
        except Exception as e:
            # Batch failures are keyed by step identifier, not by file path.
            self.context.status[self.identifier] = {"step_error": str(e)}
            if self.context.raise_on_error:
                raise e
            return None
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from functools import wraps
|
|
2
|
+
from typing import Callable, Optional
|
|
3
|
+
|
|
4
|
+
from unstructured_ingest.logger import logger
|
|
5
|
+
from unstructured_ingest.otel import OtelHandler
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def instrument(
    span_name: Optional[str] = None,
    record_exception: bool = True,
    # Fix: the default is None, so the annotation must be Optional.
    attributes: Optional[dict[str, str]] = None,
    log_out: Callable = logger.info,
) -> Callable[[Callable], Callable]:
    """Decorator factory that wraps a pipeline-step method in an otel span.

    Intended for methods on objects exposing ``identifier`` and
    ``context.otel_endpoint`` (e.g. PipelineStep).

    Args:
        span_name: explicit span name; when omitted, the span is named
            "<identifier> step" from the decorated object's ``identifier``.
        record_exception: whether the span records exceptions raised inside it.
        attributes: optional span attributes set before invoking the method.
        log_out: logging callable handed to the OtelHandler.

    Returns:
        A decorator that wraps the method so it executes inside a span.
    """

    def span_decorator(func: Callable) -> Callable:
        def get_name(self) -> str:
            # Fall back to the step identifier when no explicit name was given.
            if span_name:
                return span_name
            return f"{self.identifier} step"

        @wraps(func)
        def wrap_with_span(self, *args, **kwargs):
            name = get_name(self=self)
            otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=log_out)
            with otel_handler.get_tracer().start_as_current_span(
                name, record_exception=record_exception
            ) as span:
                otel_handler.set_attributes(span, attributes)
                return func(self, *args, **kwargs)

        return wrap_with_span

    return span_decorator
|