cognee 0.3.6__py3-none-any.whl → 0.3.7.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/health.py +2 -12
- cognee/api/v1/add/add.py +46 -6
- cognee/api/v1/add/routers/get_add_router.py +11 -2
- cognee/api/v1/cognify/cognify.py +29 -9
- cognee/api/v1/cognify/routers/get_cognify_router.py +2 -1
- cognee/api/v1/datasets/datasets.py +11 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +8 -0
- cognee/api/v1/delete/routers/get_delete_router.py +2 -0
- cognee/api/v1/memify/routers/get_memify_router.py +2 -1
- cognee/api/v1/permissions/routers/get_permissions_router.py +6 -0
- cognee/api/v1/responses/default_tools.py +0 -1
- cognee/api/v1/responses/dispatch_function.py +1 -1
- cognee/api/v1/responses/routers/default_tools.py +0 -1
- cognee/api/v1/search/routers/get_search_router.py +3 -3
- cognee/api/v1/search/search.py +11 -9
- cognee/api/v1/settings/routers/get_settings_router.py +7 -1
- cognee/api/v1/sync/routers/get_sync_router.py +3 -0
- cognee/api/v1/ui/ui.py +45 -16
- cognee/api/v1/update/routers/get_update_router.py +3 -1
- cognee/api/v1/update/update.py +3 -3
- cognee/api/v1/users/routers/get_visualize_router.py +2 -0
- cognee/cli/_cognee.py +61 -10
- cognee/cli/commands/add_command.py +3 -3
- cognee/cli/commands/cognify_command.py +3 -3
- cognee/cli/commands/config_command.py +9 -7
- cognee/cli/commands/delete_command.py +3 -3
- cognee/cli/commands/search_command.py +3 -7
- cognee/cli/config.py +0 -1
- cognee/context_global_variables.py +5 -0
- cognee/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/cache/__init__.py +2 -0
- cognee/infrastructure/databases/cache/cache_db_interface.py +79 -0
- cognee/infrastructure/databases/cache/config.py +44 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +67 -0
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +243 -0
- cognee/infrastructure/databases/exceptions/__init__.py +1 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +18 -2
- cognee/infrastructure/databases/graph/get_graph_engine.py +1 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +5 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +76 -47
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +13 -3
- cognee/infrastructure/databases/graph/neo4j_driver/deadlock_retry.py +1 -1
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +1 -1
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +1 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +21 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +17 -10
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +17 -4
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -3
- cognee/infrastructure/databases/vector/exceptions/exceptions.py +1 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -1
- cognee/infrastructure/files/exceptions.py +1 -1
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -9
- cognee/infrastructure/files/storage/S3FileStorage.py +11 -11
- cognee/infrastructure/files/utils/guess_file_type.py +6 -0
- cognee/infrastructure/llm/prompts/feedback_reaction_prompt.txt +14 -0
- cognee/infrastructure/llm/prompts/feedback_report_prompt.txt +13 -0
- cognee/infrastructure/llm/prompts/feedback_user_context_prompt.txt +5 -0
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +19 -9
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +17 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +32 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +109 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +33 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +40 -18
- cognee/infrastructure/loaders/LoaderEngine.py +27 -7
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +2 -8
- cognee/infrastructure/loaders/external/beautiful_soup_loader.py +310 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/exceptions/exceptions.py +1 -1
- cognee/modules/data/methods/__init__.py +3 -0
- cognee/modules/data/methods/get_dataset_data.py +4 -1
- cognee/modules/data/methods/has_dataset_data.py +21 -0
- cognee/modules/engine/models/TableRow.py +0 -1
- cognee/modules/ingestion/save_data_to_file.py +9 -2
- cognee/modules/pipelines/exceptions/exceptions.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +12 -1
- cognee/modules/pipelines/operations/run_tasks.py +25 -197
- cognee/modules/pipelines/operations/run_tasks_base.py +7 -0
- cognee/modules/pipelines/operations/run_tasks_data_item.py +260 -0
- cognee/modules/pipelines/operations/run_tasks_distributed.py +121 -38
- cognee/modules/pipelines/operations/run_tasks_with_telemetry.py +9 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +48 -8
- cognee/modules/retrieval/base_graph_retriever.py +3 -1
- cognee/modules/retrieval/base_retriever.py +3 -1
- cognee/modules/retrieval/chunks_retriever.py +5 -1
- cognee/modules/retrieval/code_retriever.py +20 -2
- cognee/modules/retrieval/completion_retriever.py +50 -9
- cognee/modules/retrieval/cypher_search_retriever.py +11 -1
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +47 -8
- cognee/modules/retrieval/graph_completion_cot_retriever.py +152 -22
- cognee/modules/retrieval/graph_completion_retriever.py +54 -10
- cognee/modules/retrieval/lexical_retriever.py +20 -2
- cognee/modules/retrieval/natural_language_retriever.py +10 -1
- cognee/modules/retrieval/summaries_retriever.py +5 -1
- cognee/modules/retrieval/temporal_retriever.py +62 -10
- cognee/modules/retrieval/user_qa_feedback.py +3 -2
- cognee/modules/retrieval/utils/completion.py +30 -4
- cognee/modules/retrieval/utils/description_to_codepart_search.py +1 -1
- cognee/modules/retrieval/utils/session_cache.py +156 -0
- cognee/modules/search/methods/get_search_type_tools.py +0 -5
- cognee/modules/search/methods/no_access_control_search.py +12 -1
- cognee/modules/search/methods/search.py +51 -5
- cognee/modules/search/types/SearchType.py +0 -1
- cognee/modules/settings/get_settings.py +23 -0
- cognee/modules/users/methods/get_authenticated_user.py +3 -1
- cognee/modules/users/methods/get_default_user.py +1 -6
- cognee/modules/users/roles/methods/create_role.py +2 -2
- cognee/modules/users/tenants/methods/create_tenant.py +2 -2
- cognee/shared/exceptions/exceptions.py +1 -1
- cognee/shared/logging_utils.py +18 -11
- cognee/shared/utils.py +24 -2
- cognee/tasks/codingagents/coding_rule_associations.py +1 -2
- cognee/tasks/documents/exceptions/exceptions.py +1 -1
- cognee/tasks/feedback/__init__.py +13 -0
- cognee/tasks/feedback/create_enrichments.py +84 -0
- cognee/tasks/feedback/extract_feedback_interactions.py +230 -0
- cognee/tasks/feedback/generate_improved_answers.py +130 -0
- cognee/tasks/feedback/link_enrichments_to_feedback.py +67 -0
- cognee/tasks/feedback/models.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +2 -0
- cognee/tasks/ingestion/data_item_to_text_file.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +11 -5
- cognee/tasks/ingestion/save_data_item_to_storage.py +12 -1
- cognee/tasks/storage/add_data_points.py +3 -10
- cognee/tasks/storage/index_data_points.py +19 -14
- cognee/tasks/storage/index_graph_edges.py +25 -11
- cognee/tasks/web_scraper/__init__.py +34 -0
- cognee/tasks/web_scraper/config.py +26 -0
- cognee/tasks/web_scraper/default_url_crawler.py +446 -0
- cognee/tasks/web_scraper/models.py +46 -0
- cognee/tasks/web_scraper/types.py +4 -0
- cognee/tasks/web_scraper/utils.py +142 -0
- cognee/tasks/web_scraper/web_scraper_task.py +396 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_utils.py +0 -1
- cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +13 -0
- cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +19 -0
- cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +344 -0
- cognee/tests/subprocesses/reader.py +25 -0
- cognee/tests/subprocesses/simple_cognify_1.py +31 -0
- cognee/tests/subprocesses/simple_cognify_2.py +31 -0
- cognee/tests/subprocesses/writer.py +32 -0
- cognee/tests/tasks/descriptive_metrics/metrics_test_utils.py +0 -2
- cognee/tests/tasks/descriptive_metrics/neo4j_metrics_test.py +8 -3
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +89 -0
- cognee/tests/tasks/web_scraping/web_scraping_test.py +172 -0
- cognee/tests/test_add_docling_document.py +56 -0
- cognee/tests/test_chromadb.py +7 -11
- cognee/tests/test_concurrent_subprocess_access.py +76 -0
- cognee/tests/test_conversation_history.py +240 -0
- cognee/tests/test_feedback_enrichment.py +174 -0
- cognee/tests/test_kuzu.py +27 -15
- cognee/tests/test_lancedb.py +7 -11
- cognee/tests/test_library.py +32 -2
- cognee/tests/test_neo4j.py +24 -16
- cognee/tests/test_neptune_analytics_vector.py +7 -11
- cognee/tests/test_permissions.py +9 -13
- cognee/tests/test_pgvector.py +4 -4
- cognee/tests/test_remote_kuzu.py +8 -11
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +6 -8
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +89 -0
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +154 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +51 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/METADATA +21 -6
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/RECORD +178 -139
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/entry_points.txt +1 -0
- distributed/Dockerfile +0 -3
- distributed/entrypoint.py +21 -9
- distributed/signal.py +5 -0
- distributed/workers/data_point_saving_worker.py +64 -34
- distributed/workers/graph_saving_worker.py +71 -47
- cognee/infrastructure/databases/graph/memgraph/memgraph_adapter.py +0 -1116
- cognee/modules/retrieval/insights_retriever.py +0 -133
- cognee/tests/test_memgraph.py +0 -109
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +0 -251
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.6.dist-info → cognee-0.3.7.dev1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
from uuid import NAMESPACE_OID, uuid5
|
|
5
|
+
|
|
6
|
+
from cognee.infrastructure.llm import LLMGateway
|
|
7
|
+
from cognee.infrastructure.llm.prompts.read_query_prompt import read_query_prompt
|
|
8
|
+
from cognee.shared.logging_utils import get_logger
|
|
9
|
+
from cognee.modules.engine.models import NodeSet
|
|
10
|
+
|
|
11
|
+
from .models import FeedbackEnrichment
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = get_logger("create_enrichments")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _validate_enrichments(enrichments: List[FeedbackEnrichment]) -> bool:
|
|
18
|
+
"""Validate that all enrichments contain required fields for completion."""
|
|
19
|
+
return all(
|
|
20
|
+
enrichment.question is not None
|
|
21
|
+
and enrichment.original_answer is not None
|
|
22
|
+
and enrichment.improved_answer is not None
|
|
23
|
+
and enrichment.new_context is not None
|
|
24
|
+
and enrichment.feedback_id is not None
|
|
25
|
+
and enrichment.interaction_id is not None
|
|
26
|
+
for enrichment in enrichments
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def _generate_enrichment_report(
    question: str, improved_answer: str, new_context: str, report_prompt_location: str
) -> str:
    """Ask the LLM for an educational report built from the feedback report prompt.

    Falls back to a plain concatenated string when prompt rendering or the
    LLM call fails, so callers always receive usable text.
    """
    try:
        filled_prompt = read_query_prompt(report_prompt_location).format(
            question=question,
            improved_answer=improved_answer,
            new_context=new_context,
        )
        report = await LLMGateway.acreate_structured_output(
            text_input=filled_prompt,
            system_prompt="You are a helpful assistant that creates educational content.",
            response_model=str,
        )
        return report
    except Exception as exc:
        logger.warning("Failed to generate enrichment report", error=str(exc), question=question)
        return f"Educational content for: {question} - {improved_answer}"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def create_enrichments(
    enrichments: List[FeedbackEnrichment],
    report_prompt_location: str = "feedback_report_prompt.txt",
) -> List[FeedbackEnrichment]:
    """Fill text and belongs_to_set fields of existing FeedbackEnrichment DataPoints."""
    if not enrichments:
        logger.info("No enrichments provided; returning empty list")
        return []

    if not _validate_enrichments(enrichments):
        logger.error("Input validation failed; missing required fields")
        return []

    logger.info("Completing enrichments", count=len(enrichments))

    # One shared NodeSet (deterministic id) groups every enrichment produced here.
    shared_nodeset = NodeSet(
        id=uuid5(NAMESPACE_OID, name="FeedbackEnrichment"), name="FeedbackEnrichment"
    )

    finished: List[FeedbackEnrichment] = []
    for item in enrichments:
        item.text = await _generate_enrichment_report(
            item.question,
            item.improved_answer,
            item.new_context,
            report_prompt_location,
        )
        item.belongs_to_set = [shared_nodeset]
        finished.append(item)

    logger.info("Completed enrichments", successful=len(finished))
    return finished
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
from uuid import UUID, uuid5, NAMESPACE_OID
|
|
5
|
+
|
|
6
|
+
from cognee.infrastructure.llm import LLMGateway
|
|
7
|
+
from cognee.infrastructure.llm.prompts.read_query_prompt import read_query_prompt
|
|
8
|
+
from cognee.shared.logging_utils import get_logger
|
|
9
|
+
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
10
|
+
|
|
11
|
+
from .models import FeedbackEnrichment
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = get_logger("extract_feedback_interactions")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _filter_negative_feedback(feedback_nodes):
|
|
18
|
+
"""Filter for negative sentiment feedback using precise sentiment classification."""
|
|
19
|
+
return [
|
|
20
|
+
(node_id, props)
|
|
21
|
+
for node_id, props in feedback_nodes
|
|
22
|
+
if (props.get("sentiment", "").casefold() == "negative" or props.get("score", 0) < 0)
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _get_normalized_id(node_id, props) -> str:
|
|
27
|
+
"""Return Cognee node id preference: props.id → props.node_id → raw node_id."""
|
|
28
|
+
return str(props.get("id") or props.get("node_id") or node_id)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
async def _fetch_feedback_and_interaction_graph_data() -> Tuple[List, List]:
    """Query the graph engine for feedback/interaction nodes and their edges.

    Returns ([], []) when the graph engine is unavailable or the query fails,
    so callers can treat failure as an empty graph.
    """
    node_type_filters = [{"type": ["CogneeUserFeedback", "CogneeUserInteraction"]}]
    try:
        engine = await get_graph_engine()
        return await engine.get_filtered_graph_data(node_type_filters)
    except Exception as exc:  # noqa: BLE001
        logger.error("Failed to fetch filtered graph data", error=str(exc))
        return [], []
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _separate_feedback_and_interaction_nodes(graph_nodes: List) -> Tuple[List, List]:
    """Partition graph nodes into (feedback, interaction) lists by their type property.

    Node ids are normalized via _get_normalized_id; nodes of any other type
    are dropped.
    """
    feedback_nodes: List = []
    interaction_nodes: List = []
    for node_id, props in graph_nodes:
        normalized_entry = (_get_normalized_id(node_id, props), props)
        node_type = props.get("type")
        if node_type == "CogneeUserFeedback":
            feedback_nodes.append(normalized_entry)
        elif node_type == "CogneeUserInteraction":
            interaction_nodes.append(normalized_entry)
    return feedback_nodes, interaction_nodes
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _match_feedback_nodes_to_interactions_by_edges(
|
|
58
|
+
feedback_nodes: List, interaction_nodes: List, graph_edges: List
|
|
59
|
+
) -> List[Tuple[Tuple, Tuple]]:
|
|
60
|
+
"""Match feedback to interactions using gives_feedback_to edges."""
|
|
61
|
+
interaction_by_id = {node_id: (node_id, props) for node_id, props in interaction_nodes}
|
|
62
|
+
feedback_by_id = {node_id: (node_id, props) for node_id, props in feedback_nodes}
|
|
63
|
+
feedback_edges = [
|
|
64
|
+
(source_id, target_id)
|
|
65
|
+
for source_id, target_id, rel, _ in graph_edges
|
|
66
|
+
if rel == "gives_feedback_to"
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
feedback_interaction_pairs: List[Tuple[Tuple, Tuple]] = []
|
|
70
|
+
for source_id, target_id in feedback_edges:
|
|
71
|
+
source_id_str, target_id_str = str(source_id), str(target_id)
|
|
72
|
+
|
|
73
|
+
feedback_node = feedback_by_id.get(source_id_str)
|
|
74
|
+
interaction_node = interaction_by_id.get(target_id_str)
|
|
75
|
+
|
|
76
|
+
if feedback_node and interaction_node:
|
|
77
|
+
feedback_interaction_pairs.append((feedback_node, interaction_node))
|
|
78
|
+
|
|
79
|
+
return feedback_interaction_pairs
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _sort_pairs_by_recency_and_limit(
|
|
83
|
+
feedback_interaction_pairs: List[Tuple[Tuple, Tuple]], last_n_limit: Optional[int]
|
|
84
|
+
) -> List[Tuple[Tuple, Tuple]]:
|
|
85
|
+
"""Sort by interaction created_at desc with updated_at fallback, then limit."""
|
|
86
|
+
|
|
87
|
+
def _recency_key(pair):
|
|
88
|
+
_, (_, interaction_props) = pair
|
|
89
|
+
created_at = interaction_props.get("created_at") or ""
|
|
90
|
+
updated_at = interaction_props.get("updated_at") or ""
|
|
91
|
+
return (created_at, updated_at)
|
|
92
|
+
|
|
93
|
+
sorted_pairs = sorted(feedback_interaction_pairs, key=_recency_key, reverse=True)
|
|
94
|
+
return sorted_pairs[: last_n_limit or len(sorted_pairs)]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def _generate_human_readable_context_summary(
    question_text: str, raw_context_text: str
) -> str:
    """Summarize raw retrieval context into concise human-readable text via the LLM.

    Returns the raw context unchanged (or "") when summarization fails.
    """
    try:
        template = read_query_prompt("feedback_user_context_prompt.txt")
        filled = template.format(question=question_text, context=raw_context_text)
        summary = await LLMGateway.acreate_structured_output(
            text_input=filled, system_prompt="", response_model=str
        )
        return summary
    except Exception as exc:  # noqa: BLE001
        logger.warning("Failed to summarize context", error=str(exc))
        return raw_context_text or ""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _has_required_feedback_fields(enrichment: FeedbackEnrichment) -> bool:
|
|
113
|
+
"""Validate required fields exist in the FeedbackEnrichment DataPoint."""
|
|
114
|
+
return (
|
|
115
|
+
enrichment.question is not None
|
|
116
|
+
and enrichment.original_answer is not None
|
|
117
|
+
and enrichment.context is not None
|
|
118
|
+
and enrichment.feedback_text is not None
|
|
119
|
+
and enrichment.feedback_id is not None
|
|
120
|
+
and enrichment.interaction_id is not None
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def _build_feedback_interaction_record(
    feedback_node_id: str, feedback_props: Dict, interaction_node_id: str, interaction_props: Dict
) -> Optional[FeedbackEnrichment]:
    """Assemble one FeedbackEnrichment DataPoint from a matched feedback/interaction pair.

    Returns None when required fields are missing or construction fails
    (e.g. a node id that is not a valid UUID string).
    """
    try:
        question_text = interaction_props.get("question")
        answer_text = interaction_props.get("answer")
        raw_context = interaction_props.get("context", "")
        # Feedback text may live under either key depending on the producer.
        feedback_message = feedback_props.get("feedback") or feedback_props.get("text") or ""

        summarized_context = await _generate_human_readable_context_summary(
            question_text or "", raw_context
        )

        # Deterministic id so re-running the task does not duplicate nodes.
        record = FeedbackEnrichment(
            id=str(uuid5(NAMESPACE_OID, f"{question_text}_{interaction_node_id}")),
            text="",
            question=question_text,
            original_answer=answer_text,
            improved_answer="",
            feedback_id=UUID(str(feedback_node_id)),
            interaction_id=UUID(str(interaction_node_id)),
            belongs_to_set=None,
            context=summarized_context,
            feedback_text=feedback_message,
            new_context="",
            explanation="",
        )

        if not _has_required_feedback_fields(record):
            logger.warning("Skipping invalid feedback item", interaction=str(interaction_node_id))
            return None
        return record
    except Exception as exc:  # noqa: BLE001
        logger.error("Failed to process feedback pair", error=str(exc))
        return None
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
async def _build_feedback_interaction_records(
    matched_feedback_interaction_pairs: List[Tuple[Tuple, Tuple]],
) -> List[FeedbackEnrichment]:
    """Build FeedbackEnrichment DataPoints for every matched pair, skipping failures."""
    records: List[FeedbackEnrichment] = []
    for feedback_entry, interaction_entry in matched_feedback_interaction_pairs:
        feedback_id, feedback_props = feedback_entry
        interaction_id, interaction_props = interaction_entry
        built = await _build_feedback_interaction_record(
            feedback_id, feedback_props, interaction_id, interaction_props
        )
        if built:
            records.append(built)
    return records
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
async def extract_feedback_interactions(
    data: Any, last_n: Optional[int] = None
) -> List[FeedbackEnrichment]:
    """Extract negative feedback-interaction pairs and create FeedbackEnrichment DataPoints.

    The pipeline may pass placeholder input; this task ignores it and reads
    the graph directly.
    """
    if not data or data == [{}]:
        logger.info(
            "No data passed to the extraction task (extraction task fetches data from graph directly)",
            data=data,
        )

    graph_nodes, graph_edges = await _fetch_feedback_and_interaction_graph_data()
    if not graph_nodes:
        logger.warning("No graph nodes retrieved from database")
        return []

    feedback_nodes, interaction_nodes = _separate_feedback_and_interaction_nodes(graph_nodes)
    logger.info(
        "Retrieved nodes from graph",
        total_nodes=len(graph_nodes),
        feedback_nodes=len(feedback_nodes),
        interaction_nodes=len(interaction_nodes),
    )

    negative_feedback_nodes = _filter_negative_feedback(feedback_nodes)
    logger.info(
        "Filtered feedback nodes",
        total_feedback=len(feedback_nodes),
        negative_feedback=len(negative_feedback_nodes),
    )
    if not negative_feedback_nodes:
        logger.info("No negative feedback found; returning empty list")
        return []

    matched_pairs = _match_feedback_nodes_to_interactions_by_edges(
        negative_feedback_nodes, interaction_nodes, graph_edges
    )
    if not matched_pairs:
        logger.info("No feedback-to-interaction matches found; returning empty list")
        return []

    recent_pairs = _sort_pairs_by_recency_and_limit(matched_pairs, last_n)

    records = await _build_feedback_interaction_records(recent_pairs)
    logger.info("Extracted feedback pairs", count=len(records))
    return records
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from cognee.infrastructure.llm import LLMGateway
|
|
7
|
+
from cognee.infrastructure.llm.prompts.read_query_prompt import read_query_prompt
|
|
8
|
+
from cognee.modules.graph.utils import resolve_edges_to_text
|
|
9
|
+
from cognee.shared.logging_utils import get_logger
|
|
10
|
+
|
|
11
|
+
from cognee.modules.retrieval.graph_completion_cot_retriever import GraphCompletionCotRetriever
|
|
12
|
+
from .models import FeedbackEnrichment
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ImprovedAnswerResponse(BaseModel):
    """Structured LLM output pairing a corrected answer with the reasoning behind it."""

    answer: str
    explanation: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = get_logger("generate_improved_answers")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _validate_input_data(enrichments: List[FeedbackEnrichment]) -> bool:
|
|
26
|
+
"""Validate that input contains required fields for all enrichments."""
|
|
27
|
+
return all(
|
|
28
|
+
enrichment.question is not None
|
|
29
|
+
and enrichment.original_answer is not None
|
|
30
|
+
and enrichment.context is not None
|
|
31
|
+
and enrichment.feedback_text is not None
|
|
32
|
+
and enrichment.feedback_id is not None
|
|
33
|
+
and enrichment.interaction_id is not None
|
|
34
|
+
for enrichment in enrichments
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _render_reaction_prompt(
    question: str, context: str, wrong_answer: str, negative_feedback: str
) -> str:
    """Fill the feedback reaction prompt template with the interaction details."""
    template = read_query_prompt("feedback_reaction_prompt.txt")
    rendered = template.format(
        question=question,
        context=context,
        wrong_answer=wrong_answer,
        negative_feedback=negative_feedback,
    )
    return rendered
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def _generate_improved_answer_for_single_interaction(
    enrichment: FeedbackEnrichment, retriever, reaction_prompt_location: str
) -> Optional[FeedbackEnrichment]:
    """Generate an improved answer for one enrichment via structured retriever completion.

    Args:
        enrichment: DataPoint holding the question, wrong answer, context and feedback.
        retriever: Graph CoT retriever exposing get_context / get_structured_completion /
            resolve_edges_to_text.
        reaction_prompt_location: Prompt template file used to build the query.

    Returns:
        The enrichment with improved_answer/new_context/explanation filled in,
        or None on any failure.
    """
    try:
        # Bug fix: reaction_prompt_location was previously accepted but ignored —
        # the template name was hard-coded via _render_reaction_prompt. Honor the
        # caller-supplied location (default callers pass the same file, so the
        # default behavior is unchanged).
        query_text = read_query_prompt(reaction_prompt_location).format(
            question=enrichment.question,
            context=enrichment.context,
            wrong_answer=enrichment.original_answer,
            negative_feedback=enrichment.feedback_text,
        )

        retrieved_context = await retriever.get_context(query_text)
        completion = await retriever.get_structured_completion(
            query=query_text,
            context=retrieved_context,
            response_model=ImprovedAnswerResponse,
            max_iter=4,
        )
        new_context_text = await retriever.resolve_edges_to_text(retrieved_context)

        if not completion:
            logger.warning(
                "Failed to get structured completion from retriever", question=enrichment.question
            )
            return None

        enrichment.improved_answer = completion.answer
        enrichment.new_context = new_context_text
        enrichment.explanation = completion.explanation
        return enrichment
    except Exception as exc:  # noqa: BLE001
        logger.error(
            "Failed to generate improved answer",
            error=str(exc),
            question=enrichment.question,
        )
        return None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
async def generate_improved_answers(
    enrichments: List[FeedbackEnrichment],
    top_k: int = 20,
    reaction_prompt_location: str = "feedback_reaction_prompt.txt",
) -> List[FeedbackEnrichment]:
    """Generate improved answers using CoT retriever and LLM.

    Enrichments that fail generation are logged and dropped from the result.
    """
    if not enrichments:
        logger.info("No enrichments provided; returning empty list")
        return []

    if not _validate_input_data(enrichments):
        logger.error("Input data validation failed; missing required fields")
        return []

    cot_retriever = GraphCompletionCotRetriever(
        top_k=top_k,
        save_interaction=False,
        user_prompt_path="graph_context_for_question.txt",
        system_prompt_path="answer_simple_question.txt",
    )

    successes: List[FeedbackEnrichment] = []
    for enrichment in enrichments:
        updated = await _generate_improved_answer_for_single_interaction(
            enrichment, cot_retriever, reaction_prompt_location
        )
        if updated:
            successes.append(updated)
        else:
            logger.warning(
                "Failed to generate improved answer",
                question=enrichment.question,
                interaction_id=enrichment.interaction_id,
            )

    logger.info("Generated improved answers", count=len(successes))
    return successes
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Tuple
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
7
|
+
from cognee.tasks.storage import index_graph_edges
|
|
8
|
+
from cognee.shared.logging_utils import get_logger
|
|
9
|
+
|
|
10
|
+
from .models import FeedbackEnrichment
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
logger = get_logger("link_enrichments_to_feedback")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _create_edge_tuple(
|
|
17
|
+
source_id: UUID, target_id: UUID, relationship_name: str
|
|
18
|
+
) -> Tuple[UUID, UUID, str, dict]:
|
|
19
|
+
"""Create an edge tuple with proper properties structure."""
|
|
20
|
+
return (
|
|
21
|
+
source_id,
|
|
22
|
+
target_id,
|
|
23
|
+
relationship_name,
|
|
24
|
+
{
|
|
25
|
+
"relationship_name": relationship_name,
|
|
26
|
+
"source_node_id": source_id,
|
|
27
|
+
"target_node_id": target_id,
|
|
28
|
+
"ontology_valid": False,
|
|
29
|
+
},
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def link_enrichments_to_feedback(
    enrichments: List[FeedbackEnrichment],
) -> List[FeedbackEnrichment]:
    """Manually create edges from enrichments to original feedback/interaction nodes."""
    if not enrichments:
        logger.info("No enrichments provided; returning empty list")
        return []

    # Collect one "enriches_feedback" and one "improves_interaction" edge per
    # enrichment, skipping any edge whose endpoint id is missing/falsy.
    edges = []
    for item in enrichments:
        source = item.id
        if source and item.feedback_id:
            edges.append(_create_edge_tuple(source, item.feedback_id, "enriches_feedback"))
        if source and item.interaction_id:
            edges.append(
                _create_edge_tuple(source, item.interaction_id, "improves_interaction")
            )

    if edges:
        # Persist the new edges, then index them so they become searchable.
        engine = await get_graph_engine()
        await engine.add_edges(edges)
        await index_graph_edges(edges)
        logger.info("Linking enrichments to feedback", edge_count=len(edges))

    logger.info("Linked enrichments", enrichment_count=len(enrichments))
    return enrichments
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from typing import List, Optional, Union
|
|
2
|
+
from uuid import UUID
|
|
3
|
+
|
|
4
|
+
from cognee.infrastructure.engine import DataPoint
|
|
5
|
+
from cognee.modules.engine.models import Entity, NodeSet
|
|
6
|
+
from cognee.tasks.temporal_graph.models import Event
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FeedbackEnrichment(DataPoint):
    """Minimal DataPoint for feedback enrichment that works with extract_graph_from_data."""

    # Primary text payload; listed in metadata["index_fields"], so this is the
    # field that gets indexed/embedded for search.
    text: str
    # Entities/events linked to this enrichment — presumably populated by graph
    # extraction rather than by the caller; TODO confirm.
    contains: Optional[List[Union[Entity, Event]]] = None
    # Tells the indexing pipeline which fields to index.
    metadata: dict = {"index_fields": ["text"]}

    # The original Q/A pair plus the regenerated answer.
    question: str
    original_answer: str
    improved_answer: str
    # Ids of the feedback and interaction nodes this enrichment refers to
    # (used elsewhere to create "enriches_feedback"/"improves_interaction" edges).
    feedback_id: UUID
    interaction_id: UUID
    # Optional node-set membership for grouping/partitioning in the graph.
    belongs_to_set: Optional[List[NodeSet]] = None

    # Supplementary free-text fields; default to empty strings so they are
    # always present even when not supplied.
    context: str = ""
    feedback_text: str = ""
    new_context: str = ""
    explanation: str = ""
|
|
@@ -4,6 +4,7 @@ from pydantic import BaseModel
|
|
|
4
4
|
|
|
5
5
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
6
6
|
from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
|
|
7
|
+
from cognee.tasks.storage import index_graph_edges
|
|
7
8
|
from cognee.tasks.storage.add_data_points import add_data_points
|
|
8
9
|
from cognee.modules.ontology.ontology_config import Config
|
|
9
10
|
from cognee.modules.ontology.get_default_ontology_resolver import (
|
|
@@ -88,6 +89,7 @@ async def integrate_chunk_graphs(
|
|
|
88
89
|
|
|
89
90
|
if len(graph_edges) > 0:
|
|
90
91
|
await graph_engine.add_edges(graph_edges)
|
|
92
|
+
await index_graph_edges(graph_edges)
|
|
91
93
|
|
|
92
94
|
return data_chunks
|
|
93
95
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from urllib.parse import urlparse
|
|
3
|
-
from typing import List, Tuple
|
|
3
|
+
from typing import Any, List, Tuple
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
import tempfile
|
|
6
6
|
|
|
@@ -34,7 +34,8 @@ async def pull_from_s3(file_path, destination_file) -> None:
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
async def data_item_to_text_file(
|
|
37
|
-
data_item_path: str,
|
|
37
|
+
data_item_path: str,
|
|
38
|
+
preferred_loaders: dict[str, dict[str, Any]] = None,
|
|
38
39
|
) -> Tuple[str, LoaderInterface]:
|
|
39
40
|
if isinstance(data_item_path, str):
|
|
40
41
|
parsed_url = urlparse(data_item_path)
|
|
@@ -74,6 +75,5 @@ async def data_item_to_text_file(
|
|
|
74
75
|
)
|
|
75
76
|
else:
|
|
76
77
|
raise IngestionError(message="Local files are not accepted.")
|
|
77
|
-
|
|
78
78
|
# data is not a supported type
|
|
79
79
|
raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
|
|
@@ -6,6 +6,7 @@ from typing import Union, BinaryIO, Any, List, Optional
|
|
|
6
6
|
import cognee.modules.ingestion as ingestion
|
|
7
7
|
from cognee.infrastructure.databases.relational import get_relational_engine
|
|
8
8
|
from cognee.modules.data.models import Data
|
|
9
|
+
from cognee.modules.ingestion.exceptions import IngestionError
|
|
9
10
|
from cognee.modules.users.models import User
|
|
10
11
|
from cognee.modules.users.methods import get_default_user
|
|
11
12
|
from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
|
|
@@ -27,7 +28,7 @@ async def ingest_data(
|
|
|
27
28
|
user: User,
|
|
28
29
|
node_set: Optional[List[str]] = None,
|
|
29
30
|
dataset_id: UUID = None,
|
|
30
|
-
preferred_loaders:
|
|
31
|
+
preferred_loaders: dict[str, dict[str, Any]] = None,
|
|
31
32
|
):
|
|
32
33
|
if not user:
|
|
33
34
|
user = await get_default_user()
|
|
@@ -44,7 +45,7 @@ async def ingest_data(
|
|
|
44
45
|
user: User,
|
|
45
46
|
node_set: Optional[List[str]] = None,
|
|
46
47
|
dataset_id: UUID = None,
|
|
47
|
-
preferred_loaders:
|
|
48
|
+
preferred_loaders: dict[str, dict[str, Any]] = None,
|
|
48
49
|
):
|
|
49
50
|
new_datapoints = []
|
|
50
51
|
existing_data_points = []
|
|
@@ -77,22 +78,27 @@ async def ingest_data(
|
|
|
77
78
|
dataset_data_map = {str(data.id): True for data in dataset_data}
|
|
78
79
|
|
|
79
80
|
for data_item in data:
|
|
80
|
-
# Get file path of data item or create a file it doesn't exist
|
|
81
|
+
# Get file path of data item or create a file if it doesn't exist
|
|
81
82
|
original_file_path = await save_data_item_to_storage(data_item)
|
|
82
|
-
|
|
83
83
|
# Transform file path to be OS usable
|
|
84
84
|
actual_file_path = get_data_file_path(original_file_path)
|
|
85
85
|
|
|
86
86
|
# Store all input data as text files in Cognee data storage
|
|
87
87
|
cognee_storage_file_path, loader_engine = await data_item_to_text_file(
|
|
88
|
-
actual_file_path,
|
|
88
|
+
actual_file_path,
|
|
89
|
+
preferred_loaders,
|
|
89
90
|
)
|
|
90
91
|
|
|
92
|
+
if loader_engine is None:
|
|
93
|
+
raise IngestionError("Loader cannot be None")
|
|
94
|
+
|
|
91
95
|
# Find metadata from original file
|
|
96
|
+
# Standard flow: extract metadata from both original and stored files
|
|
92
97
|
async with open_data_file(original_file_path) as file:
|
|
93
98
|
classified_data = ingestion.classify(file)
|
|
94
99
|
|
|
95
100
|
# data_id is the hash of original file contents + owner id to avoid duplicate data
|
|
101
|
+
|
|
96
102
|
data_id = ingestion.identify(classified_data, user)
|
|
97
103
|
original_file_metadata = classified_data.get_metadata()
|
|
98
104
|
|
|
@@ -8,6 +8,9 @@ from cognee.modules.ingestion import save_data_to_file
|
|
|
8
8
|
from cognee.shared.logging_utils import get_logger
|
|
9
9
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
10
10
|
|
|
11
|
+
from cognee.tasks.web_scraper.utils import fetch_page_content
|
|
12
|
+
|
|
13
|
+
|
|
11
14
|
logger = get_logger()
|
|
12
15
|
|
|
13
16
|
|
|
@@ -27,6 +30,12 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
|
|
27
30
|
|
|
28
31
|
return await get_data_from_llama_index(data_item)
|
|
29
32
|
|
|
33
|
+
if "docling" in str(type(data_item)):
|
|
34
|
+
from docling_core.types import DoclingDocument
|
|
35
|
+
|
|
36
|
+
if isinstance(data_item, DoclingDocument):
|
|
37
|
+
data_item = data_item.export_to_text()
|
|
38
|
+
|
|
30
39
|
# data is a file object coming from upload.
|
|
31
40
|
if hasattr(data_item, "file"):
|
|
32
41
|
return await save_data_to_file(data_item.file, filename=data_item.filename)
|
|
@@ -48,7 +57,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
|
|
48
57
|
# data is s3 file path
|
|
49
58
|
if parsed_url.scheme == "s3":
|
|
50
59
|
return data_item
|
|
51
|
-
|
|
60
|
+
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
|
|
61
|
+
urls_to_page_contents = await fetch_page_content(data_item)
|
|
62
|
+
return await save_data_to_file(urls_to_page_contents[data_item], file_extension="html")
|
|
52
63
|
# data is local file path
|
|
53
64
|
elif parsed_url.scheme == "file":
|
|
54
65
|
if settings.accept_local_file_path:
|