aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,244 @@
1
+ """
2
+ spaCy NER-based Entity Extractor
3
+
4
+ Extracts entities using spaCy's Named Entity Recognition.
5
+ Fast, offline, and cost-free alternative to LLM extraction.
6
+ """
7
+
8
+ from typing import List, Optional
9
+ import spacy
10
+ from spacy.language import Language
11
+
12
+ from aiecs.application.knowledge_graph.extractors.base import EntityExtractor
13
+ from aiecs.domain.knowledge_graph.models.entity import Entity
14
+
15
+
16
class NEREntityExtractor(EntityExtractor):
    """
    Extract entities using spaCy Named Entity Recognition

    This extractor uses spaCy's pre-trained NER models to identify entities.
    It's fast, free, and works offline, but limited to standard NER types.

    Features:
    - Fast extraction (no API calls)
    - Works offline
    - No cost
    - Standard NER types (PERSON, ORG, GPE, LOC, DATE, etc.)

    Limitations:
    - Only standard entity types (no custom types)
    - Limited property extraction (mainly just entity text)
    - Lower quality than LLM extraction

    Use Cases:
    - Development and testing
    - Cost-sensitive scenarios
    - High-volume extraction where LLM is too expensive
    - Baseline for comparison

    Example:
        ```python
        extractor = NEREntityExtractor(model="en_core_web_sm")

        entities = await extractor.extract_entities(
            "Alice works at Tech Corp in San Francisco."
        )
        # Returns entities shaped like (exact spans depend on the model):
        #   Entity(entity_type="Person", properties={"name": "Alice", "text": "Alice", ...})
        #   Entity(entity_type="Organization", properties={"name": "Tech Corp", ...})
        #   Entity(entity_type="Location", properties={"name": "San Francisco", ...})
        ```
    """

    # Mapping from spaCy NER labels to generic entity types.
    # Labels missing from this table are passed through unchanged.
    LABEL_MAPPING = {
        "PERSON": "Person",
        "PER": "Person",
        "ORG": "Organization",
        "ORGANIZATION": "Organization",
        "GPE": "Location",  # Geo-Political Entity
        "LOC": "Location",
        "LOCATION": "Location",
        "FAC": "Facility",
        "FACILITY": "Facility",
        "PRODUCT": "Product",
        "EVENT": "Event",
        "WORK_OF_ART": "WorkOfArt",
        "LAW": "Law",
        "LANGUAGE": "Language",
        "DATE": "Date",
        "TIME": "Time",
        "PERCENT": "Percentage",
        "MONEY": "Money",
        "QUANTITY": "Quantity",
        "ORDINAL": "Ordinal",
        "CARDINAL": "Cardinal",
    }

    def __init__(
        self,
        model: str = "en_core_web_sm",
        disable_components: Optional[List[str]] = None,
    ):
        """
        Initialize NER entity extractor

        Args:
            model: spaCy model name (default: "en_core_web_sm")
                Available models:
                - en_core_web_sm: Small English model (~13MB)
                - en_core_web_md: Medium English model (~40MB)
                - en_core_web_lg: Large English model (~560MB)
            disable_components: spaCy pipeline components to disable (for speed)
                Default: disable all except NER

        Raises:
            RuntimeError: If spaCy cannot find the requested model (OSError
                from ``spacy.load``).
        """
        self.model_name = model

        if disable_components is None:
            # Disable everything except NER for speed
            disable_components = [
                "tok2vec",
                "tagger",
                "parser",
                "attribute_ruler",
                "lemmatizer",
            ]

        # Keep the try body limited to the call that can actually raise.
        try:
            self.nlp: Language = spacy.load(model, disable=disable_components)
        except OSError as e:
            raise RuntimeError(
                f"spaCy model '{model}' not found. "
                f"Install it with: python -m spacy download {model}"
            ) from e

    async def extract_entities(
        self, text: str, entity_types: Optional[List[str]] = None, **kwargs
    ) -> List[Entity]:
        """
        Extract entities from text using spaCy NER

        Args:
            text: Input text to extract entities from
            entity_types: Optional filter for specific entity types
                (matched against the mapped type, i.e. LABEL_MAPPING values,
                or the raw spaCy label when it has no mapping)
            **kwargs: Additional parameters (unused for NER)

        Returns:
            List of extracted Entity objects, one per distinct entity text,
            in document order

        Raises:
            ValueError: If text is empty
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        # NOTE: the pipeline call below is synchronous, so it runs to
        # completion before this coroutine yields control.
        doc = self.nlp(text)

        entities: List[Entity] = []
        seen_texts = set()  # Simple deduplication within same text

        for ent in doc.ents:
            # Map spaCy label to generic entity type
            entity_type = self.LABEL_MAPPING.get(ent.label_, ent.label_)

            # Filter by entity type if requested
            if entity_types and entity_type not in entity_types:
                continue

            entity_text = ent.text.strip()

            # A span made only of whitespace would yield an entity with an
            # empty name; it carries no information, so drop it.
            if not entity_text:
                continue

            # Simple deduplication: skip if we've seen this exact text already
            # (first occurrence wins, regardless of its type).
            if entity_text in seen_texts:
                continue
            seen_texts.add(entity_text)

            entities.append(
                Entity(
                    id=self._generate_entity_id(entity_type, entity_text),
                    entity_type=entity_type,
                    properties={
                        "name": entity_text,
                        "text": entity_text,
                        "label": ent.label_,  # Original spaCy label
                        "start_char": ent.start_char,
                        "end_char": ent.end_char,
                        "_extraction_confidence": self._estimate_confidence(ent),
                    },
                )
            )

        return entities

    def _generate_entity_id(self, entity_type: str, text: str) -> str:
        """
        Generate a deterministic ID for an entity

        The ID is ``<normalized>_<hash>`` where ``normalized`` is the
        lower-cased ``type_text`` with spaces replaced by underscores and
        ``hash`` is the first 8 hex digits of the MD5 of that same string.
        Because the hash is derived from the normalized string, two inputs
        that normalize identically (e.g. differing only in case) receive the
        same ID.

        Args:
            entity_type: Entity type name
            text: Entity text

        Returns:
            Deterministic entity ID string
        """
        import hashlib

        normalized = f"{entity_type}_{text}".lower().replace(" ", "_")
        # MD5 is used purely as a short fingerprint here, not for security.
        hash_suffix = hashlib.md5(normalized.encode()).hexdigest()[:8]
        return f"{normalized}_{hash_suffix}"

    def _estimate_confidence(self, ent) -> float:
        """
        Estimate confidence for NER extraction

        No model score is read here; the value is a heuristic:
        - Start from a base of 0.7
        - Entity text longer than 20 characters: +0.1
        - Entity text shorter than 3 characters: -0.2
        - Entity text starting with an uppercase letter: +0.1

        Args:
            ent: spaCy entity (only its ``text`` attribute is used)

        Returns:
            Confidence score (0.0-1.0)
        """
        text = ent.text
        confidence = 0.7

        # Adjust based on entity length
        if len(text) > 20:
            confidence += 0.1
        elif len(text) < 3:
            confidence -= 0.2

        # Adjust based on capitalization (proper nouns). Slicing instead of
        # indexing keeps this safe if the text is ever empty.
        if text[:1].isupper():
            confidence += 0.1

        # Clamp to [0.0, 1.0]
        return max(0.0, min(1.0, confidence))

    def get_supported_types(self) -> List[str]:
        """
        Get list of entity types that this extractor can produce

        Returns:
            Distinct LABEL_MAPPING values, in the order they first appear in
            the mapping (stable across runs)
        """
        # dict.fromkeys de-duplicates while preserving insertion order,
        # unlike a set whose string ordering varies between interpreter runs.
        return list(dict.fromkeys(self.LABEL_MAPPING.values()))

    def get_available_labels(self) -> List[str]:
        """
        Get list of NER labels available in the loaded model

        Returns:
            List of spaCy NER labels
        """
        # The component exposes its labels as a tuple; convert so the return
        # value matches the declared List[str].
        return list(self.nlp.get_pipe("ner").labels)
@@ -0,0 +1,23 @@
1
+ """
2
+ Knowledge Fusion Components
3
+
4
+ Components for deduplicating, merging, and linking entities across documents.
5
+ """
6
+
7
+ from aiecs.application.knowledge_graph.fusion.entity_deduplicator import (
8
+ EntityDeduplicator,
9
+ )
10
+ from aiecs.application.knowledge_graph.fusion.entity_linker import EntityLinker
11
+ from aiecs.application.knowledge_graph.fusion.relation_deduplicator import (
12
+ RelationDeduplicator,
13
+ )
14
+ from aiecs.application.knowledge_graph.fusion.knowledge_fusion import (
15
+ KnowledgeFusion,
16
+ )
17
+
18
+ __all__ = [
19
+ "EntityDeduplicator",
20
+ "EntityLinker",
21
+ "RelationDeduplicator",
22
+ "KnowledgeFusion",
23
+ ]
@@ -0,0 +1,387 @@
1
+ """
2
+ Entity Deduplicator
3
+
4
+ Identifies and merges duplicate entities based on similarity matching.
5
+ """
6
+
7
+ from typing import List, Dict, Tuple, Set
8
+ from difflib import SequenceMatcher
9
+ from aiecs.domain.knowledge_graph.models.entity import Entity
10
+
11
+
12
+ class EntityDeduplicator:
13
+ """
14
+ Deduplicate entities based on similarity
15
+
16
+ When extracting entities from text, it's common to get duplicates:
17
+ - "Apple Inc." vs "Apple" vs "Apple Incorporated"
18
+ - "John Smith" vs "J. Smith" vs "Smith, John"
19
+ - "New York" vs "New York City" vs "NYC"
20
+
21
+ This class identifies such duplicates and merges them into canonical entities.
22
+
23
+ Features:
24
+ - Name-based fuzzy matching
25
+ - Type-aware matching (only match entities of same type)
26
+ - Property-based matching (use properties to improve matching)
27
+ - Configurable similarity threshold
28
+ - Embedding-based matching (when embeddings available)
29
+
30
+ Example:
31
+ ```python
32
+ deduplicator = EntityDeduplicator(similarity_threshold=0.85)
33
+
34
+ entities = [
35
+ Entity(type="Company", properties={"name": "Apple Inc."}),
36
+ Entity(type="Company", properties={"name": "Apple"}),
37
+ Entity(type="Company", properties={"name": "Microsoft"})
38
+ ]
39
+
40
+ deduplicated = await deduplicator.deduplicate(entities)
41
+ # Returns: [
42
+ # Entity(type="Company", properties={"name": "Apple Inc.", "_aliases": ["Apple"]}),
43
+ # Entity(type="Company", properties={"name": "Microsoft"})
44
+ # ]
45
+ ```
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ similarity_threshold: float = 0.85,
51
+ use_embeddings: bool = True,
52
+ embedding_threshold: float = 0.90,
53
+ ):
54
+ """
55
+ Initialize entity deduplicator
56
+
57
+ Args:
58
+ similarity_threshold: Minimum similarity score to consider entities as duplicates (0.0-1.0)
59
+ use_embeddings: Whether to use embeddings for similarity (if available)
60
+ embedding_threshold: Minimum embedding similarity for duplicates (0.0-1.0)
61
+ """
62
+ self.similarity_threshold = similarity_threshold
63
+ self.use_embeddings = use_embeddings
64
+ self.embedding_threshold = embedding_threshold
65
+
66
async def deduplicate(self, entities: List[Entity]) -> List[Entity]:
    """
    Collapse duplicate entities, comparing only entities of the same type.

    Args:
        entities: Entities to deduplicate

    Returns:
        Deduplicated entities. Types appear in the order they were first
        seen in the input; each type's entries come from
        ``_deduplicate_type_group``.
    """
    if not entities:
        return []

    # Bucket by entity type so that matching never crosses type boundaries.
    grouped: Dict[str, List[Entity]] = {}
    for item in entities:
        grouped.setdefault(item.entity_type, []).append(item)

    # Process each bucket independently and concatenate the results.
    result = []
    for bucket in grouped.values():
        result.extend(await self._deduplicate_type_group(bucket))
    return result
93
+
94
async def _deduplicate_type_group(self, entities: List[Entity]) -> List[Entity]:
    """
    Deduplicate a list of entities that all share one type.

    Steps:
    1. Score every unordered pair and keep the pairs whose similarity
       reaches ``self.similarity_threshold``
    2. Hand those pairs to ``_find_clusters`` to group the indices
    3. Merge each group with ``_merge_entities``

    Args:
        entities: List of entities (all same type)

    Returns:
        List of deduplicated entities; the input list itself is returned
        when it holds fewer than two entities
    """
    total = len(entities)
    if total <= 1:
        return entities

    # Index pairs (lower, higher) judged similar enough to be duplicates.
    matches: Set[Tuple[int, int]] = set()
    for left in range(total):
        for right in range(left + 1, total):
            score = await self._compute_similarity(entities[left], entities[right])
            if score >= self.similarity_threshold:
                matches.add((left, right))

    # One merged entity per cluster of mutually linked indices.
    return [
        self._merge_entities([entities[idx] for idx in members])
        for members in self._find_clusters(total, matches)
    ]
133
+
134
async def _compute_similarity(self, entity1: Entity, entity2: Entity) -> float:
    """
    Score how likely two entities are to be the same thing.

    Signals combined:
    1. Name similarity (``_string_similarity``)
    2. Property overlap (``_property_similarity``)
    3. Embedding similarity (``_cosine_similarity``), only when embeddings
       are enabled and both entities carry one

    Weights are 0.3 / 0.2 / 0.5 (name / properties / embedding) when
    embeddings take part, otherwise 0.7 / 0.3 (name / properties).

    Args:
        entity1: First entity
        entity2: Second entity

    Returns:
        Weighted similarity score; 0.0 when either entity has no name
    """
    first_name = self._get_entity_name(entity1)
    second_name = self._get_entity_name(entity2)

    # Without a name on both sides there is nothing reliable to compare.
    if not (first_name and second_name):
        return 0.0

    name_score = self._string_similarity(first_name, second_name)
    props_score = self._property_similarity(entity1.properties, entity2.properties)

    embeddings_usable = entity1.embedding and entity2.embedding and self.use_embeddings
    if not embeddings_usable:
        # No embeddings, rely on name and properties
        return 0.7 * name_score + 0.3 * props_score

    # Embeddings available: give them the largest weight.
    vector_score = self._cosine_similarity(entity1.embedding, entity2.embedding)
    return 0.3 * name_score + 0.2 * props_score + 0.5 * vector_score
175
+
176
def _get_entity_name(self, entity: Entity) -> str:
    """
    Pick a display name for an entity from its properties

    The "name", "title" and "text" properties are checked in that order and
    the first truthy value wins; an empty string is returned when none is set.
    """
    for candidate_key in ("name", "title", "text"):
        candidate = entity.properties.get(candidate_key)
        if candidate:
            return candidate
    return ""
184
+
185
def _string_similarity(self, str1: str, str2: str) -> float:
    """
    Compute string similarity using multiple methods

    Both strings are lower-cased and stripped, then compared in this order:
    - Exact match (normalized) -> 1.0
    - Exactly one string empty after normalization -> 0.0
    - One string contained in the other -> 0.95
    - Otherwise the larger of the SequenceMatcher ratio and the
      token-set overlap (shared tokens / all tokens)

    Args:
        str1: First string
        str2: Second string

    Returns:
        Similarity score (0.0-1.0)
    """
    # Normalize strings
    s1 = str1.lower().strip()
    s2 = str2.lower().strip()

    # Exact match
    if s1 == s2:
        return 1.0

    # An empty string is a substring of every string, so without this guard
    # a blank / whitespace-only name would score 0.95 against any other name.
    if not s1 or not s2:
        return 0.0

    # One is substring of other
    if s1 in s2 or s2 in s1:
        return 0.95

    # Character-level similarity
    seq_similarity = SequenceMatcher(None, s1, s2).ratio()

    # Token overlap (for multi-word names), order-insensitive
    tokens1 = set(s1.split())
    tokens2 = set(s2.split())
    if tokens1 and tokens2:
        token_overlap = len(tokens1 & tokens2) / len(tokens1 | tokens2)
    else:
        token_overlap = 0.0

    # Take whichever signal is stronger
    return max(seq_similarity, token_overlap)
226
+
227
def _property_similarity(self, props1: Dict, props2: Dict) -> float:
    """
    Compute similarity based on property overlap

    Keys starting with "_" are treated as internal and ignored. The score is
    the average of two parts: the share of keys the entities have in common,
    and the share of those common keys whose values match when compared as
    lower-cased strings.

    Args:
        props1: Properties of first entity
        props2: Properties of second entity

    Returns:
        Similarity score (0.0-1.0); 0.5 when neither entity has any
        non-internal property to compare
    """
    # Remove internal properties
    keys1 = {k for k in props1 if not k.startswith("_")}
    keys2 = {k for k in props2 if not k.startswith("_")}

    all_keys = keys1 | keys2
    if not all_keys:
        return 0.5  # No properties to compare

    # Key overlap
    common_keys = keys1 & keys2
    key_overlap = len(common_keys) / len(all_keys)

    # Value similarity for common keys (case-insensitive string comparison)
    value_matches = sum(
        1
        for key in common_keys
        if str(props1[key]).lower() == str(props2[key]).lower()
    )
    value_similarity = value_matches / len(common_keys) if common_keys else 0.0

    # Combine
    return 0.5 * key_overlap + 0.5 * value_similarity
266
+
267
def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
    """
    Compute cosine similarity between two vectors

    The raw cosine (-1 to 1) is rescaled to 0-1, so opposite vectors score
    0.0, orthogonal vectors 0.5 and parallel vectors 1.0.

    Args:
        vec1: First vector
        vec2: Second vector

    Returns:
        Cosine similarity (0.0-1.0); 0.0 when the vectors differ in length
        or either one has zero magnitude
    """
    if len(vec1) != len(vec2):
        return 0.0

    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = sum(a * a for a in vec1) ** 0.5
    magnitude2 = sum(b * b for b in vec2) ** 0.5

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    # Cosine similarity ranges from -1 to 1, normalize to 0 to 1
    similarity = dot_product / (magnitude1 * magnitude2)
    normalized = (similarity + 1) / 2

    # Floating-point rounding can push the ratio marginally outside [-1, 1];
    # clamp so the documented 0.0-1.0 range always holds.
    return max(0.0, min(1.0, normalized))
291
+
292
def _find_clusters(self, n: int, edges: Set[Tuple[int, int]]) -> List[List[int]]:
    """
    Find connected components using Union-Find

    Args:
        n: Number of nodes
        edges: Set of edges (i, j) indicating similarity

    Returns:
        List of clusters, where each cluster is a list of node indices.
        Indices inside a cluster are ascending, and clusters are ordered by
        their smallest index.
    """
    # Union-Find data structure: parent[x] == x marks a root
    parent = list(range(n))

    def find(x: int) -> int:
        # Iterative on purpose: a recursive find can exceed Python's
        # recursion limit when the edges form a long chain.
        root = x
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root
        while parent[x] != root:
            next_node = parent[x]
            parent[x] = root
            x = next_node
        return root

    def union(x: int, y: int) -> None:
        px, py = find(x), find(y)
        if px != py:
            parent[px] = py

    # Build connected components
    for i, j in edges:
        union(i, j)

    # Group by root; iterating nodes in order keeps the output deterministic
    clusters_dict: Dict[int, List[int]] = {}
    for node in range(n):
        clusters_dict.setdefault(find(node), []).append(node)

    return list(clusters_dict.values())
329
+
330
def _merge_entities(self, entities: List[Entity]) -> Entity:
    """
    Merge a cluster of similar entities into one canonical entity

    Strategy:
    - Use the first entity as base (id, type, embedding and source are kept)
    - Merge all properties: a base value is only replaced when it is missing
      or empty (None or zero-length), so legitimate 0 / False values survive
    - Store alternative names as "_aliases", in order of first appearance
    - Keep highest "_extraction_confidence" (0.5 assumed when absent)
    - Record the cluster size in "_merged_count"

    Args:
        entities: List of entities to merge (expected to be non-empty)

    Returns:
        Merged canonical entity; a single-entity cluster is returned as is
    """
    if len(entities) == 1:
        return entities[0]

    def _is_blank(value) -> bool:
        # None and zero-length strings/containers are "empty"; 0 and False are not
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return False

    # Use first entity as base
    canonical = entities[0]
    canonical_name = self._get_entity_name(canonical)

    # Collect all other names as aliases; a dict keeps first-seen order so the
    # resulting list is deterministic (a set would not be)
    aliases = {}
    for entity in entities:
        name = self._get_entity_name(entity)
        if name and name != canonical_name:
            aliases[name] = None

    # Merge properties (prefer non-empty, non-None values)
    merged_properties = dict(canonical.properties)

    for entity in entities[1:]:
        for key, value in entity.properties.items():
            if key not in merged_properties:
                merged_properties[key] = value
            elif _is_blank(merged_properties[key]) and not _is_blank(value):
                merged_properties[key] = value

    # Add aliases
    if aliases:
        merged_properties["_aliases"] = list(aliases)

    # Take highest confidence
    confidences = [e.properties.get("_extraction_confidence", 0.5) for e in entities]
    merged_properties["_extraction_confidence"] = max(confidences)

    # Track merge count
    merged_properties["_merged_count"] = len(entities)

    # Create merged entity
    merged_entity = Entity(
        id=canonical.id,
        entity_type=canonical.entity_type,
        properties=merged_properties,
        embedding=canonical.embedding,
        source=canonical.source,
    )

    return merged_entity