unstructured-ingest 1.0.6.dev0__py3-none-any.whl → 1.0.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.6-dev0" # pragma: no cover
1
+ __version__ = "1.0.8" # pragma: no cover
@@ -0,0 +1,17 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class Entity(BaseModel):
5
+ type: str
6
+ entity: str
7
+
8
+
9
+ class EntityRelationship(BaseModel):
10
+ to: str
11
+ from_: str = Field(..., alias="from")
12
+ relationship: str
13
+
14
+
15
+ class EntitiesData(BaseModel):
16
+ items: list[Entity] = Field(default_factory=list)
17
+ relationships: list[EntityRelationship] = Field(default_factory=list)
@@ -9,8 +9,9 @@ from enum import Enum
9
9
  from pathlib import Path
10
10
  from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional
11
11
 
12
- from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator
12
+ from pydantic import BaseModel, ConfigDict, Field, Secret, ValidationError, field_validator
13
13
 
14
+ from unstructured_ingest.data_types.entities import EntitiesData, Entity, EntityRelationship
14
15
  from unstructured_ingest.data_types.file_data import FileData
15
16
  from unstructured_ingest.error import DestinationConnectionError
16
17
  from unstructured_ingest.interfaces import (
@@ -97,7 +98,6 @@ class Neo4jUploadStager(UploadStager):
97
98
  **kwargs: Any,
98
99
  ) -> Path:
99
100
  elements = get_json_data(elements_filepath)
100
-
101
101
  nx_graph = self._create_lexical_graph(
102
102
  elements, self._create_document_node(file_data=file_data)
103
103
  )
@@ -109,28 +109,54 @@ class Neo4jUploadStager(UploadStager):
109
109
 
110
110
  return output_filepath
111
111
 
112
- def _add_entities(self, element: dict, graph: "Graph", element_node: _Node) -> None:
113
- entities = element.get("metadata", {}).get("entities", [])
114
- if not entities:
115
- return None
116
- if not isinstance(entities, list):
117
- return None
118
-
112
+ def _add_entities(self, entities: list[Entity], graph: "Graph", element_node: _Node) -> None:
119
113
  for entity in entities:
120
- if not isinstance(entity, dict):
121
- continue
122
- if "entity" not in entity or "type" not in entity:
123
- continue
124
114
  entity_node = _Node(
125
- labels=[Label.ENTITY], properties={"id": entity["entity"]}, id_=entity["entity"]
115
+ labels=[Label.ENTITY], properties={"id": entity.entity}, id_=entity.entity
126
116
  )
127
117
  graph.add_edge(
128
118
  entity_node,
129
- _Node(labels=[Label.ENTITY], properties={"id": entity["type"]}, id_=entity["type"]),
119
+ _Node(labels=[Label.ENTITY], properties={"id": entity.type}, id_=entity.type),
130
120
  relationship=Relationship.ENTITY_TYPE,
131
121
  )
132
122
  graph.add_edge(element_node, entity_node, relationship=Relationship.HAS_ENTITY)
133
123
 
124
+ def _add_entity_relationships(
125
+ self, relationships: list[EntityRelationship], graph: "Graph"
126
+ ) -> None:
127
+ for relationship in relationships:
128
+ from_node = _Node(
129
+ labels=[Label.ENTITY],
130
+ properties={"id": relationship.from_},
131
+ id_=relationship.from_,
132
+ )
133
+ to_node = _Node(
134
+ labels=[Label.ENTITY], properties={"id": relationship.to}, id_=relationship.to
135
+ )
136
+ graph.add_edge(from_node, to_node, relationship=relationship.relationship)
137
+
138
+ def _add_entity_data(self, element: dict, graph: "Graph", element_node: _Node) -> None:
139
+ entities = element.get("metadata", {}).get("entities", {})
140
+ if not entities:
141
+ return None
142
+ try:
143
+ if isinstance(entities, list):
144
+ self._add_entities(
145
+ [Entity.model_validate(e) for e in entities if isinstance(e, dict)],
146
+ graph,
147
+ element_node,
148
+ )
149
+ elif isinstance(entities, dict):
150
+ entity_data = EntitiesData.model_validate(entities)
151
+ self._add_entities(entity_data.items, graph, element_node)
152
+ self._add_entity_relationships(entity_data.relationships, graph)
153
+ except ValidationError:
154
+ logger.warning(
155
+ "Failed to add entities to the graph. "
156
+ "Please check the format of the entities in the input data."
157
+ )
158
+ return None
159
+
134
160
  def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
135
161
  import networkx as nx
136
162
 
@@ -149,7 +175,7 @@ class Neo4jUploadStager(UploadStager):
149
175
  previous_node = element_node
150
176
  graph.add_edge(element_node, document_node, relationship=Relationship.PART_OF_DOCUMENT)
151
177
 
152
- self._add_entities(element, graph, element_node)
178
+ self._add_entity_data(element, graph, element_node)
153
179
 
154
180
  if self._is_chunk(element):
155
181
  for origin_element in format_and_truncate_orig_elements(element, include_text=True):
@@ -165,7 +191,7 @@ class Neo4jUploadStager(UploadStager):
165
191
  document_node,
166
192
  relationship=Relationship.PART_OF_DOCUMENT,
167
193
  )
168
- self._add_entities(origin_element, graph, origin_element_node)
194
+ self._add_entity_data(origin_element, graph, origin_element_node)
169
195
 
170
196
  return graph
171
197
 
@@ -208,7 +234,9 @@ class _GraphData(BaseModel):
208
234
  _Edge(
209
235
  source=u,
210
236
  destination=v,
211
- relationship=Relationship(data_dict["relationship"]),
237
+ relationship=Relationship(data_dict["relationship"])
238
+ if data_dict["relationship"] in Relationship
239
+ else data_dict["relationship"],
212
240
  )
213
241
  for u, v, data_dict in nx_graph.edges(data=True)
214
242
  ]
@@ -242,7 +270,7 @@ class _Edge(BaseModel):
242
270
 
243
271
  source: _Node
244
272
  destination: _Node
245
- relationship: Relationship
273
+ relationship: Relationship | str
246
274
 
247
275
 
248
276
  class Label(Enum):
@@ -380,7 +408,7 @@ class Neo4jUploader(Uploader):
380
408
  )
381
409
  logger.info(f"Finished merging {len(graph_data.nodes)} graph nodes.")
382
410
 
383
- edges_by_relationship: defaultdict[tuple[Relationship, Label, Label], list[_Edge]] = (
411
+ edges_by_relationship: defaultdict[tuple[Relationship | str, Label, Label], list[_Edge]] = (
384
412
  defaultdict(list)
385
413
  )
386
414
  for edge in graph_data.edges:
@@ -463,16 +491,19 @@ class Neo4jUploader(Uploader):
463
491
  @staticmethod
464
492
  def _create_edges_query(
465
493
  edges: list[_Edge],
466
- relationship: Relationship,
494
+ relationship: Relationship | str,
467
495
  source_label: Label,
468
496
  destination_label: Label,
469
497
  ) -> tuple[str, dict]:
470
498
  logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")
499
+ relationship = (
500
+ relationship.value if isinstance(relationship, Relationship) else relationship
501
+ )
471
502
  query_string = f"""
472
503
  UNWIND $edges AS edge
473
504
  MATCH (u: `{source_label.value}` {{id: edge.source}})
474
505
  MATCH (v: `{destination_label.value}` {{id: edge.destination}})
475
- MERGE (u)-[:`{relationship.value}`]->(v)
506
+ MERGE (u)-[:`{relationship}`]->(v)
476
507
  """
477
508
  parameters = {
478
509
  "edges": [
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.6.dev0
3
+ Version: 1.0.8
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=f9THPBYTYgzAGHqOPz3K_-VIVaQAVKMJgCpBy7w0j_k,47
2
+ unstructured_ingest/__version__.py,sha256=Ca4MzLfEjKrGXx21Kyt3Ve65pu59qVvEIU_io-qxQ9o,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -19,6 +19,7 @@ unstructured_ingest/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
19
19
  unstructured_ingest/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
20
20
  unstructured_ingest/cli/utils/model_conversion.py,sha256=hMjAfOVvO1RXTDsw26mmersdncvddkb_rP9JTEgVVCw,7649
21
21
  unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrHYIJZfb67hUIxA,371
22
23
  unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
23
24
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
25
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
@@ -77,7 +78,7 @@ unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03
77
78
  unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
78
79
  unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
79
80
  unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
80
- unstructured_ingest/processes/connectors/neo4j.py,sha256=eAM2XWSLA5caKJmbcd7ctn2TapreIJEXRoHoxT1OZwA,18718
81
+ unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
81
82
  unstructured_ingest/processes/connectors/onedrive.py,sha256=VBkKlbJgR7uKlKTnjNybAw6ZawLKflDPpy2uVvgWYWw,19296
82
83
  unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
83
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=BdO1PS_Y6FOeL-7uPl-Eh6ij1wHOwMkopOzKQGQ9Ac0,13979
@@ -230,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
230
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
231
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
232
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
233
- unstructured_ingest-1.0.6.dev0.dist-info/METADATA,sha256=2A8V0IxykBKTDpalgDOmkRw98MoN6whUPI0DwHkBuBc,8724
234
- unstructured_ingest-1.0.6.dev0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
- unstructured_ingest-1.0.6.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
- unstructured_ingest-1.0.6.dev0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
- unstructured_ingest-1.0.6.dev0.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.8.dist-info/METADATA,sha256=IzN7b_dpadQBKpp59jO7VfWzgQfJrF8ykGLo7epNMeY,8719
235
+ unstructured_ingest-1.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.8.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.8.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.8.dist-info/RECORD,,