cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py
@@ -0,0 +1,52 @@
+ # ----------------------------------------------------------------------------
+ #
+ # Welcome to Baml! To use this generated code, please run the following:
+ #
+ # $ pip install baml
+ #
+ # ----------------------------------------------------------------------------
+
+ # This file was generated by BAML: please do not edit it. Instead, edit the
+ # BAML files and re-generate this code using: baml-cli generate
+ # baml-cli is available with the baml package.
+
+ from . import types
+ from . import stream_types
+
+
+ type_map = {
+     "types.AudioContent": types.AudioContent,
+     "stream_types.AudioContent": stream_types.AudioContent,
+     "types.ContentLabel": types.ContentLabel,
+     "stream_types.ContentLabel": stream_types.ContentLabel,
+     "types.DefaultContentPrediction": types.DefaultContentPrediction,
+     "stream_types.DefaultContentPrediction": stream_types.DefaultContentPrediction,
+     "types.DynamicKnowledgeGraph": types.DynamicKnowledgeGraph,
+     "stream_types.DynamicKnowledgeGraph": stream_types.DynamicKnowledgeGraph,
+     "types.Edge": types.Edge,
+     "stream_types.Edge": stream_types.Edge,
+     "types.ImageContent": types.ImageContent,
+     "stream_types.ImageContent": stream_types.ImageContent,
+     "types.KnowledgeGraph": types.KnowledgeGraph,
+     "stream_types.KnowledgeGraph": stream_types.KnowledgeGraph,
+     "types.Model3DContent": types.Model3DContent,
+     "stream_types.Model3DContent": stream_types.Model3DContent,
+     "types.MultimediaContent": types.MultimediaContent,
+     "stream_types.MultimediaContent": stream_types.MultimediaContent,
+     "types.Node": types.Node,
+     "stream_types.Node": stream_types.Node,
+     "types.ProceduralContent": types.ProceduralContent,
+     "stream_types.ProceduralContent": stream_types.ProceduralContent,
+     "types.SummarizedClass": types.SummarizedClass,
+     "stream_types.SummarizedClass": stream_types.SummarizedClass,
+     "types.SummarizedCode": types.SummarizedCode,
+     "stream_types.SummarizedCode": stream_types.SummarizedCode,
+     "types.SummarizedContent": types.SummarizedContent,
+     "stream_types.SummarizedContent": stream_types.SummarizedContent,
+     "types.SummarizedFunction": types.SummarizedFunction,
+     "stream_types.SummarizedFunction": stream_types.SummarizedFunction,
+     "types.TextContent": types.TextContent,
+     "stream_types.TextContent": stream_types.TextContent,
+     "types.VideoContent": types.VideoContent,
+     "stream_types.VideoContent": stream_types.VideoContent,
+ }
cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py
@@ -0,0 +1,166 @@
+ # ----------------------------------------------------------------------------
+ #
+ # Welcome to Baml! To use this generated code, please run the following:
+ #
+ # $ pip install baml
+ #
+ # ----------------------------------------------------------------------------
+
+ # This file was generated by BAML: please do not edit it. Instead, edit the
+ # BAML files and re-generate this code using: baml-cli generate
+ # baml-cli is available with the baml package.
+
+ import typing
+ import typing_extensions
+ from enum import Enum
+
+
+ from pydantic import BaseModel, ConfigDict
+
+
+ import baml_py
+
+ CheckT = typing_extensions.TypeVar("CheckT")
+ CheckName = typing_extensions.TypeVar("CheckName", bound=str)
+
+
+ class Check(BaseModel):
+     name: str
+     expression: str
+     status: str
+
+
+ class Checked(BaseModel, typing.Generic[CheckT, CheckName]):
+     value: CheckT
+     checks: typing.Dict[CheckName, Check]
+
+
+ def get_checks(checks: typing.Dict[CheckName, Check]) -> typing.List[Check]:
+     return list(checks.values())
+
+
+ def all_succeeded(checks: typing.Dict[CheckName, Check]) -> bool:
+     return all(check.status == "succeeded" for check in get_checks(checks))
+
+
+ # #########################################################################
+ # Generated enums (0)
+ # #########################################################################
+
+ # #########################################################################
+ # Generated classes (17)
+ # #########################################################################
+
+
+ class AudioContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class ContentLabel(BaseModel):
+     content_type: typing.Union[
+         typing_extensions.Literal["text"],
+         typing_extensions.Literal["audio"],
+         typing_extensions.Literal["image"],
+         typing_extensions.Literal["video"],
+         typing_extensions.Literal["multimedia"],
+         typing_extensions.Literal["3d_model"],
+         typing_extensions.Literal["procedural"],
+     ]
+     type: str
+     subclass: typing.List[str]
+
+
+ class DefaultContentPrediction(BaseModel):
+     label: "ContentLabel"
+
+
+ class DynamicKnowledgeGraph(BaseModel):
+     model_config = ConfigDict(extra="allow")
+
+
+ class Edge(BaseModel):
+     # doc string for edge
+     # doc string for source_node_id
+
+     source_node_id: str
+     target_node_id: str
+     relationship_name: str
+
+
+ class ImageContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class KnowledgeGraph(BaseModel):
+     nodes: typing.List["Node"]
+     edges: typing.List["Edge"]
+
+
+ class Model3DContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class MultimediaContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class Node(BaseModel):
+     model_config = ConfigDict(extra="allow")
+     id: str
+     name: str
+     type: str
+     description: str
+
+
+ class ProceduralContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class SummarizedClass(BaseModel):
+     name: str
+     description: str
+     methods: typing.Optional[typing.List["SummarizedFunction"]] = None
+     decorators: typing.Optional[typing.List[str]] = None
+
+
+ class SummarizedCode(BaseModel):
+     high_level_summary: str
+     key_features: typing.List[str]
+     imports: typing.List[str]
+     constants: typing.List[str]
+     classes: typing.List["SummarizedClass"]
+     functions: typing.List["SummarizedFunction"]
+     workflow_description: typing.Optional[str] = None
+
+
+ class SummarizedContent(BaseModel):
+     summary: str
+     description: str
+
+
+ class SummarizedFunction(BaseModel):
+     name: str
+     description: str
+     inputs: typing.Optional[typing.List[str]] = None
+     outputs: typing.Optional[typing.List[str]] = None
+     decorators: typing.Optional[typing.List[str]] = None
+
+
+ class TextContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ class VideoContent(BaseModel):
+     type: str
+     subclass: typing.List[str]
+
+
+ # #########################################################################
+ # Generated type aliases (0)
+ # #########################################################################
cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml
@@ -0,0 +1,109 @@
+ // Content classification data models - matching shared/data_models.py
+ class TextContent {
+   type string
+   subclass string[]
+ }
+
+ class AudioContent {
+   type string
+   subclass string[]
+ }
+
+ class ImageContent {
+   type string
+   subclass string[]
+ }
+
+ class VideoContent {
+   type string
+   subclass string[]
+ }
+
+ class MultimediaContent {
+   type string
+   subclass string[]
+ }
+
+ class Model3DContent {
+   type string
+   subclass string[]
+ }
+
+ class ProceduralContent {
+   type string
+   subclass string[]
+ }
+
+ class ContentLabel {
+   content_type "text" | "audio" | "image" | "video" | "multimedia" | "3d_model" | "procedural"
+   type string
+   subclass string[]
+ }
+
+ class DefaultContentPrediction {
+   label ContentLabel
+ }
+
+ // Content classification prompt template
+ template_string ClassifyContentPrompt() #"
+   You are a classification engine and should classify content. Make sure to use one of the existing classification options and not invent your own.
+
+   Classify the content into one of these main categories and their relevant subclasses:
+
+   **TEXT CONTENT** (content_type: "text"):
+   - type: "TEXTUAL_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Articles, essays, and reports", "Books and manuscripts", "News stories and blog posts", "Research papers and academic publications", "Social media posts and comments", "Website content and product descriptions", "Personal narratives and stories", "Spreadsheets and tables", "Forms and surveys", "Databases and CSV files", "Source code in various programming languages", "Shell commands and scripts", "Markup languages (HTML, XML)", "Stylesheets (CSS) and configuration files (YAML, JSON, INI)", "Chat transcripts and messaging history", "Customer service logs and interactions", "Conversational AI training data", "Textbook content and lecture notes", "Exam questions and academic exercises", "E-learning course materials", "Poetry and prose", "Scripts for plays, movies, and television", "Song lyrics", "Manuals and user guides", "Technical specifications and API documentation", "Helpdesk articles and FAQs", "Contracts and agreements", "Laws, regulations, and legal case documents", "Policy documents and compliance materials", "Clinical trial reports", "Patient records and case notes", "Scientific journal articles", "Financial reports and statements", "Business plans and proposals", "Market research and analysis reports", "Ad copies and marketing slogans", "Product catalogs and brochures", "Press releases and promotional content", "Professional and formal correspondence", "Personal emails and letters", "Image and video captions", "Annotations and metadata for various media", "Vocabulary lists and grammar rules", "Language exercises and quizzes", "Other types of text data"]
+
+   **AUDIO CONTENT** (content_type: "audio"):
+   - type: "AUDIO_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Music tracks and albums", "Podcasts and radio broadcasts", "Audiobooks and audio guides", "Recorded interviews and speeches", "Sound effects and ambient sounds", "Other types of audio recordings"]
+
+   **IMAGE CONTENT** (content_type: "image"):
+   - type: "IMAGE_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Photographs and digital images", "Illustrations, diagrams, and charts", "Infographics and visual data representations", "Artwork and paintings", "Screenshots and graphical user interfaces", "Other types of images"]
+
+   **VIDEO CONTENT** (content_type: "video"):
+   - type: "VIDEO_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Movies and short films", "Documentaries and educational videos", "Video tutorials and how-to guides", "Animated features and cartoons", "Live event recordings and sports broadcasts", "Other types of video content"]
+
+   **MULTIMEDIA CONTENT** (content_type: "multimedia"):
+   - type: "MULTIMEDIA_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Interactive web content and games", "Virtual reality (VR) and augmented reality (AR) experiences", "Mixed media presentations and slide decks", "E-learning modules with integrated multimedia", "Digital exhibitions and virtual tours", "Other types of multimedia content"]
+
+   **3D MODEL CONTENT** (content_type: "3d_model"):
+   - type: "3D_MODEL_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Architectural renderings and building plans", "Product design models and prototypes", "3D animations and character models", "Scientific simulations and visualizations", "Virtual objects for AR/VR applications", "Other types of 3D models"]
+
+   **PROCEDURAL CONTENT** (content_type: "procedural"):
+   - type: "PROCEDURAL_DOCUMENTS_USED_FOR_GENERAL_PURPOSES"
+   - subclass options: ["Tutorials and step-by-step guides", "Workflow and process descriptions", "Simulation and training exercises", "Recipes and crafting instructions", "Other types of procedural content"]
+
+   Select the most appropriate content_type, type, and relevant subclasses.
+ "#
+
+ // OpenAI client defined once for all BAML files
+
+ // Classification function
+ function ExtractCategories(content: string) -> DefaultContentPrediction {
+   client OpenAI
+
+   prompt #"
+     {{ ClassifyContentPrompt() }}
+
+     {{ ctx.output_format(prefix="Answer in this schema:\n") }}
+
+     {{ _.role('user') }}
+     {{ content }}
+   "#
+ }
+
+ // Test case for classification
+ test ExtractCategoriesExample {
+   functions [ExtractCategories]
+   args {
+     content #"
+       Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval.
+       It deals with the interaction between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.
+     "#
+   }
+ }
cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml
@@ -0,0 +1,343 @@
+ class Node {
+   id string
+   name string
+   type string
+   description string
+   @@dynamic
+ }
+
+ /// doc string for edge
+ class Edge {
+   /// doc string for source_node_id
+   source_node_id string
+   target_node_id string
+   relationship_name string
+ }
+
+ class KnowledgeGraph {
+   nodes (Node @stream.done)[]
+   edges Edge[]
+ }
+
+ // Summarization classes
+ class SummarizedContent {
+   summary string
+   description string
+ }
+
+ class SummarizedFunction {
+   name string
+   description string
+   inputs string[]?
+   outputs string[]?
+   decorators string[]?
+ }
+
+ class SummarizedClass {
+   name string
+   description string
+   methods SummarizedFunction[]?
+   decorators string[]?
+ }
+
+ class SummarizedCode {
+   high_level_summary string
+   key_features string[]
+   imports string[]
+   constants string[]
+   classes SummarizedClass[]
+   functions SummarizedFunction[]
+   workflow_description string?
+ }
+
+ class DynamicKnowledgeGraph {
+   @@dynamic
+ }
+
+
+ // Simple template for basic extraction (fast, good quality)
+ template_string ExtractContentGraphPrompt() #"
+   You are an advanced algorithm that extracts structured data into a knowledge graph.
+
+   - **Nodes**: Entities/concepts (like Wikipedia articles).
+   - **Edges**: Relationships (like Wikipedia links). Use snake_case (e.g., `acted_in`).
+
+   **Rules:**
+
+   1. **Node Labeling & IDs**
+   - Use basic types only (e.g., "Person", "Date", "Organization").
+   - Avoid overly specific or generic terms (e.g., no "Mathematician" or "Entity").
+   - Node IDs must be human-readable names from the text (no numbers).
+
+   2. **Dates & Numbers**
+   - Label dates as **"Date"** in "YYYY-MM-DD" format (use available parts if incomplete).
+   - Properties are key-value pairs; do not use escaped quotes.
+
+   3. **Coreference Resolution**
+   - Use a single, complete identifier for each entity (e.g., always "John Doe" not "Joe" or "he").
+
+   4. **Relationship Labels**:
+   - Use descriptive, lowercase, snake_case names for edges.
+   - *Example*: born_in, married_to, invented_by.
+   - Avoid vague or generic labels like isA, relatesTo, has.
+   - Avoid duplicated relationships like produces, produced by.
+
+   5. **Strict Compliance**
+   - Follow these rules exactly. Non-compliance results in termination.
+ "#
+
+ // Summarization prompt template
+ template_string SummarizeContentPrompt() #"
+   You are a top-tier summarization engine. Your task is to summarize text and make it versatile.
+   Be brief and concise, but keep the important information and the subject.
+   Use synonym words where possible in order to change the wording but keep the meaning.
+ "#
+
+ // Code summarization prompt template
+ template_string SummarizeCodePrompt() #"
+   You are an expert code analyst. Analyze the provided source code and extract key information:
+
+   1. Provide a high-level summary of what the code does
+   2. List key features and functionality
+   3. Identify imports and dependencies
+   4. List constants and global variables
+   5. Summarize classes with their methods
+   6. Summarize standalone functions
+   7. Describe the overall workflow if applicable
+
+   Be precise and technical while remaining clear and concise.
+ "#
+
+ // Detailed template for complex extraction (slower, higher quality)
+ template_string DetailedExtractContentGraphPrompt() #"
+   You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
+   **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
+   **Edges** represent relationships between concepts. They're akin to Wikipedia links.
+
+   The aim is to achieve simplicity and clarity in the knowledge graph.
+
+   # 1. Labeling Nodes
+   **Consistency**: Ensure you use basic or elementary types for node labels.
+   - For example, when you identify an entity representing a person, always label it as **"Person"**.
+   - Avoid using more specific terms like "Mathematician" or "Scientist", keep those as "profession" property.
+   - Don't use too generic terms like "Entity".
+   **Node IDs**: Never utilize integers as node IDs.
+   - Node IDs should be names or human-readable identifiers found in the text.
+
+   # 2. Handling Numerical Data and Dates
+   - For example, when you identify an entity representing a date, make sure it has type **"Date"**.
+   - Extract the date in the format "YYYY-MM-DD"
+   - If not possible to extract the whole date, extract month or year, or both if available.
+   - **Property Format**: Properties must be in a key-value format.
+   - **Quotation Marks**: Never use escaped single or double quotes within property values.
+   - **Naming Convention**: Use snake_case for relationship names, e.g., `acted_in`.
+
+   # 3. Coreference Resolution
+   - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
+   If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
+   always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the Person's ID.
+   Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
+
+   # 4. Strict Compliance
+   Adhere to the rules strictly. Non-compliance will result in termination.
+ "#
+
+ // Guided template with step-by-step instructions
+ template_string GuidedExtractContentGraphPrompt() #"
+   You are an advanced algorithm designed to extract structured information to build a clean, consistent, and human-readable knowledge graph.
+
+   **Objective**:
+   - Nodes represent entities and concepts, similar to Wikipedia articles.
+   - Edges represent typed relationships between nodes, similar to Wikipedia hyperlinks.
+   - The graph must be clear, minimal, consistent, and semantically precise.
+
+   **Node Guidelines**:
+
+   1. **Label Consistency**:
+   - Use consistent, basic types for all node labels.
+   - Do not switch between granular or vague labels for the same kind of entity.
+   - Pick one label for each category and apply it uniformly.
+   - Each entity type should be in a singular form and in a case of multiple words separated by whitespaces
+
+   2. **Node Identifiers**:
+   - Node IDs must be human-readable and derived directly from the text.
+   - Prefer full names and canonical terms.
+   - Never use integers or autogenerated IDs.
+   - *Example*: Use "Marie Curie", "Theory of Evolution", "Google".
+
+   3. **Coreference Resolution**:
+   - Maintain one consistent node ID for each real-world entity.
+   - Resolve aliases, acronyms, and pronouns to the most complete form.
+   - *Example*: Always use "John Doe" even if later referred to as "Doe" or "he".
+
+   **Edge Guidelines**:
+
+   4. **Relationship Labels**:
+   - Use descriptive, lowercase, snake_case names for edges.
+   - *Example*: born_in, married_to, invented_by.
+   - Avoid vague or generic labels like isA, relatesTo, has.
+
+   5. **Relationship Direction**:
+   - Edges must be directional and logically consistent.
+   - *Example*:
+   - "Marie Curie" —[born_in]→ "Warsaw"
+   - "Radioactivity" —[discovered_by]→ "Marie Curie"
+
+   **Compliance**:
+   Strict adherence to these guidelines is required. Any deviation will result in immediate termination of the task.
+ "#
+
+ // Strict template with zero-tolerance rules
+ template_string StrictExtractContentGraphPrompt() #"
+   You are a top-tier algorithm for **extracting structured information** from unstructured text to build a **knowledge graph**.
+
+   Your primary goal is to extract:
+   - **Nodes**: Representing **entities** and **concepts** (like Wikipedia nodes).
+   - **Edges**: Representing **relationships** between those concepts (like Wikipedia links).
+
+   The resulting knowledge graph must be **simple, consistent, and human-readable**.
+
+   ## 1. Node Labeling and Identification
+
+   ### Node Types
+   Use **basic atomic types** for node labels. Always prefer general types over specific roles or professions:
+   - "Person" for any human.
+   - "Organization" for companies, institutions, etc.
+   - "Location" for geographic or place entities.
+   - "Date" for any temporal expression.
+   - "Event" for historical or scheduled occurrences.
+   - "Work" for books, films, artworks, or research papers.
+   - "Concept" for abstract notions or ideas.
+
+   ### Node IDs
+   - Always assign **human-readable and unambiguous identifiers**.
+   - Never use numeric or autogenerated IDs.
+   - Prioritize **most complete form** of entity names for consistency.
+
+   ## 2. Relationship Handling
+   - Use **snake_case** for all relationship (edge) types.
+   - Keep relationship types semantically clear and consistent.
+   - Avoid vague relation names like "related_to" unless no better alternative exists.
+
+   ## 3. Strict Compliance
+   Follow all rules exactly. Any deviation may lead to rejection or incorrect graph construction.
+ "#
+
+ // OpenAI client with environment model selection
+ client<llm> OpenAI {
+   provider openai
+   options {
+     model client_registry.model
+     api_key client_registry.api_key
+   }
+ }
+
+
+
+ // Function that returns raw structured output (for custom objects - to be handled in Python)
+ function ExtractContentGraphGeneric(
+   content: string,
+   mode: "simple" | "base" | "guided" | "strict" | "custom"?,
+   custom_prompt_content: string?
+ ) -> KnowledgeGraph {
+   client OpenAI
+
+   prompt #"
+     {% if mode == "base" %}
+     {{ DetailedExtractContentGraphPrompt() }}
+     {% elif mode == "guided" %}
+     {{ GuidedExtractContentGraphPrompt() }}
+     {% elif mode == "strict" %}
+     {{ StrictExtractContentGraphPrompt() }}
+     {% elif mode == "custom" and custom_prompt_content %}
+     {{ custom_prompt_content }}
+     {% else %}
+     {{ ExtractContentGraphPrompt() }}
+     {% endif %}
+
+     {{ ctx.output_format(prefix="Answer in this schema:\n") }}
+
+     Before answering, briefly describe what you'll extract from the text, then provide the structured output.
+
+     Example format:
+     I'll extract the main entities and their relationships from this text...
+
+     { ... }
+
+     {{ _.role('user') }}
+     {{ content }}
+   "#
+ }
+
+ // Backward-compatible function specifically for KnowledgeGraph
+ function ExtractDynamicContentGraph(
+   content: string,
+   mode: "simple" | "base" | "guided" | "strict" | "custom"?,
+   custom_prompt_content: string?
+ ) -> DynamicKnowledgeGraph {
+   client OpenAI
+
+   prompt #"
+     {% if mode == "base" %}
+     {{ DetailedExtractContentGraphPrompt() }}
+     {% elif mode == "guided" %}
+     {{ GuidedExtractContentGraphPrompt() }}
+     {% elif mode == "strict" %}
+     {{ StrictExtractContentGraphPrompt() }}
+     {% elif mode == "custom" and custom_prompt_content %}
+     {{ custom_prompt_content }}
+     {% else %}
+     {{ ExtractContentGraphPrompt() }}
+     {% endif %}
+
+     {{ ctx.output_format(prefix="Answer in this schema:\n") }}
+
+     Before answering, briefly describe what you'll extract from the text, then provide the structured output.
+
+     Example format:
+     I'll extract the main entities and their relationships from this text...
+
+     { ... }
+
+     {{ _.role('user') }}
+     {{ content }}
+   "#
+ }
+
+
+ // Summarization functions
+ function SummarizeContent(content: string) -> SummarizedContent {
+   client OpenAI
+
+   prompt #"
+     {{ SummarizeContentPrompt() }}
+
+     {{ ctx.output_format(prefix="Answer in this schema:\n") }}
+
+     {{ _.role('user') }}
+     {{ content }}
+   "#
+ }
+
+ function SummarizeCode(content: string) -> SummarizedCode {
+   client OpenAI
+
+   prompt #"
+     {{ SummarizeCodePrompt() }}
+
+     {{ ctx.output_format(prefix="Answer in this schema:\n") }}
+
+     {{ _.role('user') }}
+     {{ content }}
+   "#
+ }
+
+ test ExtractStrictExample {
+   functions [ExtractContentGraphGeneric]
+   args {
+     content #"
+       The Python programming language was created by Guido van Rossum in 1991.
+     "#
+     mode "strict"
+   }
+ }
cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py
@@ -1 +1,2 @@
  from .knowledge_graph.extract_content_graph import extract_content_graph
+ from .extract_summary import extract_summary, extract_code_summary