cwyodmodules 0.3.44__py3-none-any.whl → 0.3.46__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- cwyodmodules/__init__.py +0 -0
- cwyodmodules/batch/utilities/helpers/config/config_helper.py +0 -9
- cwyodmodules/batch/utilities/helpers/config/default.json +147 -148
- cwyodmodules/batch/utilities/helpers/secret_helper.py +79 -80
- cwyodmodules/graphrag/indexing/extraction.py +237 -230
- cwyodmodules/graphrag/main.py +34 -34
- cwyodmodules/graphrag/query/generate.py +106 -91
- {cwyodmodules-0.3.44.dist-info → cwyodmodules-0.3.46.dist-info}/METADATA +1 -1
- {cwyodmodules-0.3.44.dist-info → cwyodmodules-0.3.46.dist-info}/RECORD +12 -11
- {cwyodmodules-0.3.44.dist-info → cwyodmodules-0.3.46.dist-info}/WHEEL +0 -0
- {cwyodmodules-0.3.44.dist-info → cwyodmodules-0.3.46.dist-info}/licenses/LICENSE +0 -0
- {cwyodmodules-0.3.44.dist-info → cwyodmodules-0.3.46.dist-info}/top_level.txt +0 -0
cwyodmodules/graphrag/indexing/extraction.py
CHANGED
@@ -1,230 +1,237 @@

(removed: the 230-line 0.3.44 version of this file; its content did not survive this rendering apart from stray `from` / `)` fragments)

The added 0.3.46 version follows in full:

```python
from .types import (
    EntityExtractionResult,
    RelationshipExtractionResult,
    Chunk,
    EntityModel,
    RelationshipModel,
    KeptVsMerged,
    ChunkModel,
    HighLevelKeywords
)
from ..llm.entity_extraction import extract_entities as llm_extract_entities
from typing import List, Dict, Tuple
import pandas as pd

from graphrag.database.base import get_db
from graphrag.database.models import Chunk, Entity, Relationship

from graphrag.indexing.utils import calculate_hash
from graphrag.llm.llm import extract_entities_completion

from typing import (List,
                    Tuple,
                    Dict,
                    Any,
                    Set)
from fuzzywuzzy import fuzz
from openai import RateLimitError

import uuid
import asyncio
import uuid


def _extract_chunk_info_from_db_if_exists(chunk: str) -> Tuple[List[EntityModel], List[RelationshipModel], ChunkModel] | None:

    db = next(get_db())
    hash = "chunk-" + calculate_hash(text=chunk)
    exists = db.query(Chunk).filter(Chunk.hash == hash).first()
    if exists:
        entities_models, relationships_models = [], []
        entities = db.query(Entity).filter(Entity.chunk_id == exists.chunk_id).all()
        for entity in entities:
            entity_model = EntityModel(
                entity_name=entity.entity_name,
                entity_type=entity.entity_type,
                entity_description=entity.description,
                chunk_id=str(exists.chunk_id)
            )
            entities_models.append(entity_model)
        relationships = db.query(Relationship).filter(Relationship.chunk_id == exists.chunk_id).all()
        for rel in relationships:
            rel_model = RelationshipModel(
                source_entity=rel.source_entity.entity_name,
                target_entity=rel.target_entity.entity_name,
                relationship_description=rel.description,
                relationship_strength=rel.weight,
                chunk_id=str(exists.chunk_id),
                relationship_keywords=rel.keywords
            )
            relationships_models.append(rel_model)
        db.close()
        return entities_models, relationships_models, ChunkModel(text=exists.text, id=str(exists.chunk_id))
    db.close()
    return None
```
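This lookup makes re-indexing idempotent: a chunk's entities and relationships are keyed by a content hash, so a chunk that was already processed is served from the database instead of being re-extracted. A minimal sketch of the idea, assuming a SHA-256 content hash (the package's `calculate_hash` may differ) and a plain dict standing in for the `Chunk`/`Entity`/`Relationship` tables:

```python
import hashlib

def content_hash(text: str) -> str:
    # Stable hash of the chunk text: same text -> same key across runs.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

cache: dict[str, tuple] = {}  # stand-in for the database tables

def get_or_extract(chunk: str) -> tuple:
    key = "chunk-" + content_hash(chunk)
    if key in cache:              # mirrors db.query(Chunk).filter(Chunk.hash == hash)
        return cache[key]         # reuse previously extracted results
    result = (["entities..."], ["relationships..."])  # placeholder for the LLM call
    cache[key] = result
    return result

print(get_or_extract("Acme acquired Beta.") is get_or_extract("Acme acquired Beta."))  # True
```

The added file continues with the entity-merge pass: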
```python
def _merge_entities(entities: List[EntityModel], threshold: int=75) -> Tuple[List[EntityModel], Dict[str, Set[str]]]:

    def find_most_similar(entity: EntityModel, candidates: List[EntityModel], threshold: int) -> List[EntityModel]:

        most_sim_entity: List[EntityModel] = []
        for candidate_entity in candidates:
            if (entity.entity_type != candidate_entity.entity_type): continue
            try:
                score = fuzz.ratio(entity.entity_name, candidate_entity.entity_name)
            except IndexError:
                continue
            if score > threshold:
                most_sim_entity.append(candidate_entity)

        return most_sim_entity

    kept_vs_merged = {}

    merged_entities = set()
    modified_entities: Dict[Tuple[str, str, str, str], List[EntityModel]] = {}
    for index, entity in enumerate(entities):
        most_sim_entities = find_most_similar(entity=entity, candidates=entities[index+1:], threshold=threshold)
        entity_key = (entity.entity_name, entity.entity_type, entity.entity_description)

        if entity_key in merged_entities:
            print(f"{entity_key} already exists as an entity")
            continue

        if (entity.entity_name, entity.entity_type, entity.entity_description, entity.get_chunk_id) not in modified_entities:
            modified_entities[(entity.entity_name, entity.entity_type, entity.entity_description, entity.get_chunk_id)] = []
        if most_sim_entities:
            for most_sim_entity in most_sim_entities:
                most_sim_key = (most_sim_entity.entity_name, most_sim_entity.entity_type, most_sim_entity.entity_description)
                print(f"{most_sim_key[:2]} has been identified as similar to another entity")
                merged_entities.add(most_sim_key)
                modified_entities[(entity.entity_name, entity.entity_type, entity.entity_description, entity.get_chunk_id)].append(most_sim_entity)
                if most_sim_entity.entity_name not in kept_vs_merged:
                    kept_vs_merged[most_sim_entity.entity_name] = {entity.entity_name}
                else:
                    kept_vs_merged[most_sim_entity.entity_name].add(entity.entity_name)

    updated_entities = []
    for entity_info, sim_entities in modified_entities.items():

        if entity_info[:-1] in merged_entities and not len(sim_entities): continue

        updated_entities.append(
            EntityModel(
                entity_name=entity_info[0],
                entity_type=entity_info[1],
                entity_description=entity_info[2] + "\n".join([sim_entity.entity_description for sim_entity in sim_entities]),
                chunk_id=set([entity_info[3]] + [sim_entity.get_chunk_id for sim_entity in sim_entities])
            )
        )

    return updated_entities, kept_vs_merged
```
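`_merge_entities` clusters near-duplicate names with `fuzz.ratio`, merging only entities of the same type and folding each match's description and chunk ids into the first occurrence; `kept_vs_merged` records which surviving name absorbed which duplicates. A stripped-down sketch of that threshold rule (the printed values assume fuzzywuzzy's 0–100 similarity score):

```python
from fuzzywuzzy import fuzz

names = ["Acme Corp", "Acme Corp.", "Beta LLC"]
threshold = 75

kept: list[str] = []
merged_into: dict[str, str] = {}
for name in names:
    match = next((k for k in kept if fuzz.ratio(name, k) > threshold), None)
    if match is None:
        kept.append(name)          # first of its cluster survives
    else:
        merged_into[name] = match  # near-duplicates fold into the survivor

print(kept)         # ['Acme Corp', 'Beta LLC']
print(merged_into)  # {'Acme Corp.': 'Acme Corp'}
```

Relationship merging applies the same idea at the edge level: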
```python
def _merge_relationships(
    relationships: List[RelationshipModel], kept_vs_merged_entities: Dict[str, List[str]]
) -> List[RelationshipModel]:

    for relationship in relationships:
        source, target = relationship.source_entity, relationship.target_entity
        try:
            if source in kept_vs_merged_entities:
                relationship.source_entity = list(kept_vs_merged_entities[source])[0]
            if target in kept_vs_merged_entities:
                relationship.target_entity = list(kept_vs_merged_entities[target])[0]
        except (KeyError, IndexError):
            print(f"Something went wrong for edge: {(source, target)}")
            continue

    merged_relationships = {}
    for relationship in relationships:
        edge = (relationship.source_entity, relationship.target_entity)
        if edge not in merged_relationships:
            merged_relationships[edge] = relationship
            continue
        print(f"Edge: ({source}, {target}) already exists")
        existing_edge = merged_relationships[edge]
        existing_edge.relationship_description += "\n" + relationship.relationship_description
        existing_edge.relationship_strength += relationship.relationship_strength
        existing_edge.relationship_keywords += relationship.relationship_keywords
        existing_edge.relationship_keywords = list(set(existing_edge.relationship_keywords))
        existing_edge.update_chunk_ids(relationship.chunk_id)

    return list(merged_relationships.values())


async def _extract_graph_information_from_chunk(chunk: str, entity_types: Dict[str, str], gleaning: int=1) -> Tuple[List[EntityModel], List[RelationshipModel], ChunkModel] | None:

    already_exists = _extract_chunk_info_from_db_if_exists(chunk=chunk)
    if already_exists is not None:
        return already_exists
    chunk_info: Dict[str, Any] = {}
    for _ in range(gleaning):
        gleaning_chunk_info = await extract_entities_completion(chunk=chunk,
                                                                history=None,
                                                                entity_types=entity_types)
        if gleaning_chunk_info is None: continue

        more_chunk_info = await extract_entities_completion(
            chunk=chunk, history=str(chunk_info), entity_types=entity_types
        )
        if more_chunk_info is not None:
            chunk_info.update(more_chunk_info)

    chunk_model = ChunkModel(text=chunk, id=str(uuid.uuid4()))
    try:
        entities, relationships, high_level_keywords = [chunk_info[key] for key in ("entities", "relationships", "content_keywords")]
    except KeyError as e:
        print(f"KeyError -> {e}")
        print(chunk_info)
        raise e
    if isinstance(high_level_keywords, list): high_level_keywords = {"content_keywords": high_level_keywords}
    entities_models, relationships_models, high_level_keywords_models = [
        [model(**val, chunk_id={chunk_model.id}) for val in values] if isinstance(values, list) else [model(**values, chunk_id={chunk_model.id})]
        for model, values in zip((EntityModel, RelationshipModel, HighLevelKeywords),
                                 (entities, relationships, high_level_keywords))
    ]
    return entities_models, relationships_models, chunk_model
```
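The `gleaning` loop re-prompts the model with the stringified results accumulated so far, so later passes can pick up entities an earlier pass missed. A distilled sketch of that intent (one extraction call per pass, with a stub standing in for `extract_entities_completion`), not a line-for-line mirror of the code above:

```python
import asyncio
from typing import Any, Dict, Optional

async def glean(chunk: str, extract_fn, passes: int = 2) -> Dict[str, Any]:
    info: Dict[str, Any] = {}
    for _ in range(passes):
        more: Optional[Dict[str, Any]] = await extract_fn(
            chunk=chunk, history=str(info) if info else None
        )
        if more is not None:
            info.update(more)   # later passes extend or overwrite earlier keys
    return info

async def _stub(chunk: str, history: Optional[str] = None) -> Dict[str, Any]:
    # First pass finds entities; the follow-up pass adds keywords.
    return {"entities": ["ACME"]} if history is None else {"content_keywords": ["acquisition"]}

print(asyncio.run(glean("Acme acquired Beta.", _stub)))
# -> {'entities': ['ACME'], 'content_keywords': ['acquisition']}
```

The batch driver ties the pieces together: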
```python
async def extract_entities(chunks: List[str], entity_types: Dict[str, str], gleaning: int=1, batch: int=15) -> Tuple[List[EntityModel], List[RelationshipModel], Dict[str, Set[str]], List[ChunkModel]]:

    if len(chunks) > batch:
        results = []
        for k in range(0, len(chunks), batch):
            batch_chunks = chunks[k: k + batch]
            try:
                results.extend(
                    await asyncio.gather(*[
                        _extract_graph_information_from_chunk(chunk=chunk, gleaning=gleaning, entity_types=entity_types) for chunk in batch_chunks
                    ])
                )
            except RateLimitError:
                print("Rate limit error. Sleeping for a few seconds...")
                await asyncio.sleep(2)
                sub_batch = batch // 2
                for j in range(0, len(batch_chunks), sub_batch):
                    results.extend(
                        await asyncio.gather(*[
                            _extract_graph_information_from_chunk(chunk=chunk, gleaning=gleaning, entity_types=entity_types) for chunk in batch_chunks[j: j + sub_batch]
                        ])
                    )
                    await asyncio.sleep(1)
            await asyncio.sleep(1)
    else:
        results = await asyncio.gather(*[
            _extract_graph_information_from_chunk(chunk=chunk, gleaning=gleaning, entity_types=entity_types) for chunk in chunks
        ])

    if results is None:
        return None

    entities, relationships, chunks_models = [], [], []

    for result in results:
        if result is None:
            continue
        entities_n, relationships_n, chunk = result
        entities.extend(entities_n)
        relationships.extend(relationships_n)
        chunks_models.append(chunk)

    entities, kept_vs_merged = _merge_entities(entities=entities)
    relationships = _merge_relationships(relationships=relationships, kept_vs_merged_entities=kept_vs_merged)

    return entities, relationships, kept_vs_merged, chunks_models
```
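`extract_entities` fans each batch out with `asyncio.gather` and, on a rate-limit error, sleeps and retries that batch in half-sized groups. A toy version of the backoff pattern, with a stand-in exception so it runs without the openai client and a deterministic simulated failure:

```python
import asyncio

class RateLimitError(Exception):  # stand-in for openai.RateLimitError
    pass

calls = {"n": 0}

async def extract_one(chunk: str) -> str:
    calls["n"] += 1
    if calls["n"] == 3:            # simulate one 429 partway through the first batch
        raise RateLimitError(chunk)
    return f"extracted:{chunk}"

async def extract_all(chunks: list[str], batch: int = 4) -> list[str]:
    results: list[str] = []
    for k in range(0, len(chunks), batch):
        group = chunks[k:k + batch]
        try:
            results.extend(await asyncio.gather(*(extract_one(c) for c in group)))
        except RateLimitError:
            await asyncio.sleep(2)                 # back off...
            half = max(1, batch // 2)
            for j in range(0, len(group), half):   # ...then retry in half-size groups
                results.extend(await asyncio.gather(*(extract_one(c) for c in group[j:j + half])))
                await asyncio.sleep(1)
        await asyncio.sleep(1)                     # pace between batches
    return results

print(asyncio.run(extract_all([f"c{i}" for i in range(6)])))
```

Note that, as in the package code, a failed `gather` discards that batch's partial results and the whole batch is re-run in smaller groups.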
cwyodmodules/graphrag/main.py
CHANGED
@@ -1,34 +1,34 @@

(removed: the 34-line 0.3.44 version; the rendering truncated its import lines to bare `from`, and every fully rendered removed line matches the added version below)

The added 0.3.46 version:

```python
from .indexing import (create_chunks,
                       upsert_data_and_create_graph,
                       extract_entities)
from .query.generate import _local_query, _global_query, _hybrid_query, _naive_rag

from .config import GlobalConfig
from typing import List, Dict, Tuple, Any, Set

import networkx as nx


async def insert(text: str, config: GlobalConfig) -> nx.Graph:
    chunks = await create_chunks(text=text,
                                 min_token_size=config.min_chunk_size,
                                 max_token_size=config.max_chunk_size)
    print(f"{len(chunks)} chunks created")
    entities, relationships, kept_vs_merged, chunk_models = await extract_entities(chunks=chunks, entity_types=config.entity_types, gleaning=config.max_gleaning, batch=config.batch)
    print(f"{len(entities)} entities extracted and {len(relationships)} relationships extracted. ")
    graph = await upsert_data_and_create_graph(entities=entities, relationships=relationships, chunks=chunk_models)
    return graph

async def local_query(query: str, config: GlobalConfig) -> Tuple[str | None, List[str], Dict[str, Dict[str, Any]], List[str]]:
    response, chunk_texts, nodes, keywords = await _local_query(query=query, top_k=config.keywords_top_k, max_nodes=config.graph_top_k, order_range=config.order_range)
    return response, chunk_texts, nodes, keywords

async def global_query(query: str, config: GlobalConfig) -> Tuple[str | None, List[str], Dict[str, Any], List[str]]:
    response, chunk_texts, chunks, keywords = await _global_query(query=query, top_k=config.keywords_top_k, max_nodes=config.graph_top_k, alpha=config.alpha)
    return response, chunk_texts, chunks, keywords

async def hybrid_query(query: str, config: GlobalConfig) -> Tuple[str | None, List[str], Tuple[Dict[str, Any], Dict[str, Dict[str, Any]]], Set[str]]:
    return await _hybrid_query(query=query, top_k=config.keywords_top_k, max_nodes=config.graph_top_k, alpha=config.alpha, order_range=config.order_range)

async def naive_query(query: str, config: GlobalConfig) -> Tuple[str, List[str]]:
    return await _naive_rag(query=query, top_k=config.graph_top_k)
```
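For orientation, a hypothetical driver for this API. The import paths and `GlobalConfig` attribute names are the ones main.py itself uses; constructing `GlobalConfig` with keyword arguments, and the specific values, are assumptions for illustration:

```python
import asyncio

from cwyodmodules.graphrag.config import GlobalConfig
from cwyodmodules.graphrag.main import insert, local_query

async def demo() -> None:
    # Hypothetical construction; the real GlobalConfig may be built differently.
    config = GlobalConfig(
        min_chunk_size=128, max_chunk_size=512,
        entity_types={"organization": "companies", "person": "people"},
        max_gleaning=1, batch=15,
        keywords_top_k=5, graph_top_k=10, order_range=2, alpha=0.5,
    )
    graph = await insert(text="Acme Corp acquired Beta LLC in 2021.", config=config)
    print(graph.number_of_nodes(), "nodes in the graph")

    response, chunk_texts, nodes, keywords = await local_query(
        query="Who acquired Beta LLC?", config=config
    )
    print(response)

asyncio.run(demo())
```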