aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text Chunker
|
|
3
|
+
|
|
4
|
+
Splits large texts into manageable chunks for processing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional, Dict, Any
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class TextChunk:
    """
    A chunk of text with metadata

    Attributes:
        text: The chunk text content
        start_char: Starting character position in original text
        end_char: Ending character position in original text (exclusive)
        chunk_index: Index of this chunk (0-based)
        metadata: Optional metadata about this chunk
    """

    text: str
    start_char: int
    end_char: int
    chunk_index: int
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # An explicit ``metadata=None`` bypasses ``default_factory``; normalise
        # it so consumers can always treat ``metadata`` as a dict.
        if self.metadata is None:
            self.metadata = {}


class TextChunker:
    """
    Split large texts into smaller chunks

    Strategies:
    - Fixed size chunking (by character count)
    - Sentence-aware chunking (don't break sentences)
    - Paragraph-aware chunking (preserve paragraphs)
    - Overlapping chunks (for context preservation; applied by the fixed size
      and sentence-aware strategies, not by the paragraph-aware one)

    Example:
        ```python
        chunker = TextChunker(chunk_size=1000, overlap=100)
        chunks = chunker.chunk_text(long_document)

        for chunk in chunks:
            # Process each chunk separately
            result = await process(chunk.text)
        ```
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 100,
        respect_sentences: bool = True,
        respect_paragraphs: bool = False,
        min_chunk_size: int = 100,
    ):
        """
        Initialize text chunker

        Args:
            chunk_size: Target size for each chunk (in characters)
            overlap: Number of characters to overlap between chunks
            respect_sentences: Try to break at sentence boundaries
            respect_paragraphs: Try to break at paragraph boundaries
                (takes precedence over respect_sentences)
            min_chunk_size: Minimum chunk size (don't create tiny chunks).
                Stored on the instance; none of the strategies in this class
                currently consult it.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.respect_sentences = respect_sentences
        self.respect_paragraphs = respect_paragraphs
        self.min_chunk_size = min_chunk_size

    def chunk_text(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> List[TextChunk]:
        """
        Split text into chunks

        Args:
            text: Text to chunk
            metadata: Optional metadata to attach to chunks. Every chunk
                receives its own shallow copy.

        Returns:
            List of TextChunk objects (empty when text is empty)

        Raises:
            ValueError: If fixed size chunking is selected and chunk_size is
                not positive or overlap is not smaller than chunk_size
        """
        if not text:
            return []

        # Handle short texts: a single chunk covering the whole input
        if len(text) <= self.chunk_size:
            return [
                TextChunk(
                    text=text,
                    start_char=0,
                    end_char=len(text),
                    chunk_index=0,
                    metadata=self._copy_metadata(metadata),
                )
            ]

        # Choose chunking strategy
        if self.respect_paragraphs:
            return self._chunk_by_paragraphs(text, metadata)
        if self.respect_sentences:
            return self._chunk_by_sentences(text, metadata)
        return self._chunk_fixed_size(text, metadata)

    def _chunk_fixed_size(self, text: str, metadata: Optional[Dict[str, Any]]) -> List[TextChunk]:
        """
        Chunk text by fixed size with overlap

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects

        Raises:
            ValueError: If chunk_size is not positive or overlap is not
                smaller than chunk_size (the window could never advance)
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        step = self.chunk_size - self.overlap
        if step <= 0:
            raise ValueError("overlap must be smaller than chunk_size")

        chunks: List[TextChunk] = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = min(start + self.chunk_size, text_length)
            chunks.append(
                TextChunk(
                    text=text[start:end],
                    start_char=start,
                    end_char=end,
                    chunk_index=len(chunks),
                    metadata=self._copy_metadata(metadata),
                )
            )

            # Once a chunk reaches the end of the text, any further window
            # would lie entirely inside it, so stop here.
            if end >= text_length:
                break

            # Move to next chunk with overlap
            start += step

        return chunks

    def _chunk_by_sentences(self, text: str, metadata: Optional[Dict[str, Any]]) -> List[TextChunk]:
        """
        Chunk text respecting sentence boundaries

        Sentences are joined with a single space inside a chunk. Trailing
        sentences of a finished chunk are repeated at the start of the next
        one as long as they fit into the configured overlap.

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects whose start_char/end_char delimit the
            covered sentences in the original text
        """
        # Simple sentence splitting (can be improved with NLTK/spaCy)
        sentences = self._split_sentences(text)
        located = self._locate_units(text, sentences)
        return self._pack_units(located, " ", metadata, use_overlap=True)

    def _chunk_by_paragraphs(
        self, text: str, metadata: Optional[Dict[str, Any]]
    ) -> List[TextChunk]:
        """
        Chunk text respecting paragraph boundaries

        Paragraphs are separated by blank lines ("\\n\\n"), stripped, and
        joined with "\\n\\n" inside a chunk. A paragraph longer than
        chunk_size is kept whole. No overlap is applied.

        Args:
            text: Text to chunk
            metadata: Optional metadata

        Returns:
            List of TextChunk objects whose start_char/end_char delimit the
            covered paragraphs in the original text
        """
        # Split by double newlines (paragraphs), dropping empty ones
        paragraphs = [para.strip() for para in text.split("\n\n")]
        located = self._locate_units(text, [para for para in paragraphs if para])
        return self._pack_units(located, "\n\n", metadata, use_overlap=False)

    def _split_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences (simple implementation)

        For production, consider using NLTK's sent_tokenize or spaCy.

        Args:
            text: Text to split

        Returns:
            List of sentences
        """
        import re

        # Simple sentence splitting by period, question mark, exclamation
        # This is a basic implementation - can be improved
        sentences = re.split(r"(?<=[.!?])\s+", text)
        return [s.strip() for s in sentences if s.strip()]

    def _get_overlap_sentences(self, sentences: List[str]) -> List[str]:
        """
        Get last few sentences for overlap

        Args:
            sentences: List of sentences

        Returns:
            Last few sentences that fit in overlap size (each sentence is
            counted with one extra character for the joining space)
        """
        if not sentences or self.overlap <= 0:
            return []

        used = 0
        keep = 0

        # Take sentences from end until we reach overlap size
        for sent in reversed(sentences):
            cost = len(sent) + 1
            if used + cost > self.overlap:
                break
            used += cost
            keep += 1

        return sentences[len(sentences) - keep:] if keep else []

    @staticmethod
    def _copy_metadata(metadata: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Return a per-chunk shallow copy of metadata (empty dict for None)."""
        return dict(metadata) if metadata else {}

    @staticmethod
    def _locate_units(text: str, units: List[str]) -> List[tuple]:
        """
        Find where each unit (sentence/paragraph) sits in the original text

        Units are expected in document order. Because the gaps between them
        contain only separators, searching forward from the end of the
        previous unit finds the exact occurrence.

        Returns:
            List of (unit, start, end) tuples with end exclusive
        """
        located = []
        cursor = 0
        for unit in units:
            position = text.find(unit, cursor)
            if position < 0:
                # Not a verbatim slice of the text (e.g. a customised
                # splitter rewrote it): fall back to the running cursor.
                position = cursor
            end = min(position + len(unit), len(text))
            located.append((unit, position, end))
            cursor = end
        return located

    def _build_chunk(
        self,
        located: List[tuple],
        separator: str,
        chunk_index: int,
        metadata: Optional[Dict[str, Any]],
    ) -> TextChunk:
        """Create one chunk from consecutive (unit, start, end) tuples."""
        return TextChunk(
            text=separator.join(unit for unit, _, _ in located),
            start_char=located[0][1],
            end_char=located[-1][2],
            chunk_index=chunk_index,
            metadata=self._copy_metadata(metadata),
        )

    def _pack_units(
        self,
        located: List[tuple],
        separator: str,
        metadata: Optional[Dict[str, Any]],
        use_overlap: bool,
    ) -> List[TextChunk]:
        """
        Greedily pack located units into chunks of at most chunk_size

        A unit that alone exceeds chunk_size still forms its own chunk; units
        are never split.

        Args:
            located: (unit, start, end) tuples in document order
            separator: String used to join units inside a chunk
            metadata: Optional metadata
            use_overlap: Carry trailing units over into the next chunk

        Returns:
            List of TextChunk objects
        """
        chunks: List[TextChunk] = []
        current: List[tuple] = []
        current_length = 0
        separator_length = len(separator)

        for item in located:
            unit_length = len(item[0])

            # If adding this unit would exceed chunk_size, finalize first
            if current and current_length + unit_length > self.chunk_size:
                chunks.append(self._build_chunk(current, separator, len(chunks), metadata))

                if use_overlap:
                    # Start new chunk with overlap (last few units)
                    kept = len(self._get_overlap_sentences([unit for unit, _, _ in current]))
                    current = current[len(current) - kept:] if kept else []
                else:
                    current = []
                current_length = sum(len(unit) + separator_length for unit, _, _ in current)

            current.append(item)
            current_length += unit_length + separator_length

        # Add final chunk
        if current:
            chunks.append(self._build_chunk(current, separator, len(chunks), metadata))

        return chunks
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge Graph Entity and Relation Extractors
|
|
3
|
+
|
|
4
|
+
This module provides extractors for building knowledge graphs from text.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from aiecs.application.knowledge_graph.extractors.base import (
|
|
8
|
+
EntityExtractor,
|
|
9
|
+
RelationExtractor,
|
|
10
|
+
)
|
|
11
|
+
from aiecs.application.knowledge_graph.extractors.llm_entity_extractor import (
|
|
12
|
+
LLMEntityExtractor,
|
|
13
|
+
)
|
|
14
|
+
from aiecs.application.knowledge_graph.extractors.ner_entity_extractor import (
|
|
15
|
+
NEREntityExtractor,
|
|
16
|
+
)
|
|
17
|
+
from aiecs.application.knowledge_graph.extractors.llm_relation_extractor import (
|
|
18
|
+
LLMRelationExtractor,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"EntityExtractor",
|
|
23
|
+
"RelationExtractor",
|
|
24
|
+
"LLMEntityExtractor",
|
|
25
|
+
"NEREntityExtractor",
|
|
26
|
+
"LLMRelationExtractor",
|
|
27
|
+
]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Abstract Classes for Entity and Relation Extraction
|
|
3
|
+
|
|
4
|
+
Defines the interface for extracting entities and relations from text.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
10
|
+
from aiecs.domain.knowledge_graph.models.relation import Relation
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EntityExtractor(ABC):
    """
    Interface implemented by every entity extractor

    An implementation receives raw text and returns the entities it finds in
    that text. The detection technique (LLM, NER, rule-based, etc.) is left
    to the concrete subclass.

    Example:
        ```python
        extractor = LLMEntityExtractor(llm_client, schema)
        entities = await extractor.extract_entities(
            "Alice works at Tech Corp in San Francisco"
        )
        # Returns: [Entity(Person: Alice), Entity(Company: Tech Corp), ...]
        ```
    """

    @abstractmethod
    async def extract_entities(
        self,
        text: str,
        entity_types: Optional[List[str]] = None,
        **kwargs,
    ) -> List[Entity]:
        """
        Find the entities mentioned in a piece of text

        Args:
            text: The text to scan for entities
            entity_types: Restrict extraction to these entity types
                (e.g., ["Person", "Company"]). When None, every type the
                extractor supports is extracted.
            **kwargs: Extra parameters specific to the concrete extractor

        Returns:
            The Entity objects detected in the text

        Raises:
            ValueError: If text is empty or invalid
            RuntimeError: If extraction fails
        """
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class RelationExtractor(ABC):
    """
    Interface implemented by every relation extractor

    An implementation receives text together with the entities already found
    in it and returns the relations (edges) that connect those entities.
    Extraction is therefore a two-stage process: entities first, then the
    relations between them.

    Example:
        ```python
        extractor = LLMRelationExtractor(llm_client, schema)

        # Entities already extracted
        alice = Entity(id="e1", entity_type="Person", properties={"name": "Alice"})
        tech_corp = Entity(id="e2", entity_type="Company", properties={"name": "Tech Corp"})

        relations = await extractor.extract_relations(
            text="Alice works at Tech Corp",
            entities=[alice, tech_corp]
        )
        # Returns: [Relation(alice -[WORKS_FOR]-> tech_corp)]
        ```
    """

    @abstractmethod
    async def extract_relations(
        self,
        text: str,
        entities: List[Entity],
        relation_types: Optional[List[str]] = None,
        **kwargs,
    ) -> List[Relation]:
        """
        Find the relations that hold between already-known entities

        Args:
            text: The text in which the entities occur
            entities: Entities previously extracted from this text
            relation_types: Restrict extraction to these relation types
                (e.g., ["WORKS_FOR", "KNOWS"]). When None, every type the
                extractor supports is extracted.
            **kwargs: Extra parameters specific to the concrete extractor

        Returns:
            The Relation objects detected between the given entities

        Raises:
            ValueError: If text is empty or entities list is empty
            RuntimeError: If extraction fails
        """
|