@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,557 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Knowledge Graph Extractor for Pentatonic Memory Engine
4
+ Extracts entities and relationships from memory files and writes them to Neo4j.
5
+
6
+ Usage:
7
+ python3 kg-extractor.py # Process all new/modified files
8
+ python3 kg-extractor.py --file path # Process single file
9
+ python3 kg-extractor.py --stats # Show extraction stats
10
+ python3 kg-extractor.py --dry-run # Show what would be extracted
+ python3 kg-extractor.py --force # Reprocess all files, ignoring saved state
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import re
17
+ import sys
18
+ import time
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+ from typing import Dict, List, Set, Tuple, Optional
22
+
23
+ # Try to import spaCy, fall back to regex if unavailable
24
+ try:
25
+ import spacy
26
+ try:
27
+ nlp = spacy.load("en_core_web_sm")
28
+ SPACY_AVAILABLE = True
29
+ print("Using spaCy NER for entity extraction")
30
+ except OSError:
31
+ print("spaCy model 'en_core_web_sm' not found, falling back to regex patterns")
32
+ SPACY_AVAILABLE = False
33
+ nlp = None
34
+ except ImportError:
35
+ print("spaCy not available, falling back to regex patterns")
36
+ SPACY_AVAILABLE = False
37
+ nlp = None
38
+
39
+ try:
40
+ from neo4j import GraphDatabase
41
+ except ImportError:
42
+ GraphDatabase = None
43
+
44
# Neo4j connection settings. Every value can be overridden through the
# environment; the defaults match a stock local Neo4j install.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Root of the memory tree, resolved once instead of once per glob below.
_PME_DIR = os.environ.get("PME_DIR", os.path.expanduser("~/pentatonic"))

# Sub-directories of <PME_DIR>/memory whose *.md files are scanned, in scan order.
_MEMORY_SUBDIRS = (
    "people",
    "daily",
    "projects",
    "your_project",
    "your_company",
    "tools",
    "rules",
    "security",
    "research",
    "reviews",
    "health",
    "contacts",
    "messages",
    "email",
    "linkedin",
    "KNOWLEDGE_BASE",
    "slack",
)

# File paths to process (glob patterns, plus the top-level MEMORY.md)
MEMORY_PATHS = [
    os.path.join(_PME_DIR, f"memory/{subdir}/*.md") for subdir in _MEMORY_SUBDIRS
]
MEMORY_PATHS.append(os.path.join(_PME_DIR, "MEMORY.md"))

# State file to track processed files (maps file path -> mtime at last run)
STATE_FILE = os.path.expanduser("~/.pme/kg-extractor-state.json")

# Regex patterns for fallback entity extraction, keyed by entity type.
ENTITY_PATTERNS = {
    "PERSON": [
        r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b',  # Full names only
        r'\b(?:YOUR_HUMAN|CONTACT_1|CONTACT_2|CONTACT_3)\b'  # Known first names from context
    ],
    "ORG": [
        r'\b(?:Google|Microsoft|Apple|Amazon|Meta|Tesla|NVIDIA|OpenAI|Anthropic|DeepMind|YOUR_COMPANY|YOUR_PROJECT|Acme|Globex|Initech|Umbrella)\b',
        r'\b[A-Z][a-zA-Z\s]+(?:Ltd|Inc|Corp|Company|Group|Technologies|Systems|Solutions|Labs|University|College|Institute)\b'
    ],
    "PROJECT": [
        r'\b[A-Z][a-zA-Z\s]+(?:Project|Platform|System|Framework|Tool|API|App)\b',
        r'\b(?:Phase|Sprint|Build|LACG|Neo)\s+\d+\b'
    ],
    "TECHNOLOGY": [
        r'\b(?:Python|JavaScript|React|Node\.js|Docker|Kubernetes|AWS|GCP|Azure|PostgreSQL|MongoDB|Redis|Neo4j|spaCy)\b',
        r'\b[a-z]+\.[a-z]+(?:\.[a-z]+)*\b'  # domain names
    ]
}

# Relationship patterns, keyed by relationship type. Each regex captures the
# two related entities as groups 1 and 2.
RELATIONSHIP_PATTERNS = {
    "WORKS_AT": [
        r'(\w+(?:\s+\w+)*)\s+(?:works\s+(?:at|for)|is\s+(?:at|with)|employed\s+(?:at|by))\s+(\w+(?:\s+\w+)*)',
        r'(\w+(?:\s+\w+)*)\s+(?:@|at)\s+(\w+(?:\s+\w+)*)'
    ],
    "MARRIED_TO": [
        r'(\w+(?:\s+\w+)*)\s+(?:married\s+to|wife\s+of|husband\s+of|spouse\s+of)\s+(\w+(?:\s+\w+)*)',
        r'(\w+(?:\s+\w+)*)\s+and\s+(\w+(?:\s+\w+)*)\s+(?:are\s+)?married'
    ],
    "FRIEND_OF": [
        r'(\w+(?:\s+\w+)*)\s+(?:is\s+(?:friends?\s+with|a\s+friend\s+of|mates?\s+with)|knows)\s+(\w+(?:\s+\w+)*)',
        r'(\w+(?:\s+\w+)*)\s+(?:friend|mate|buddy)\s+(\w+(?:\s+\w+)*)'
    ],
    "KNOWS_PERSON": [
        r'(\w+(?:\s+\w+)*)\s+(?:knows|met|connected\s+with|introduced\s+to)\s+(\w+(?:\s+\w+)*)',
        r'(\w+(?:\s+\w+)*)\s+and\s+(\w+(?:\s+\w+)*)\s+(?:know\s+each\s+other|are\s+connected)'
    ],
    "WORKS_ON": [
        r'(\w+(?:\s+\w+)*)\s+(?:works\s+on|building|developing|created|maintains)\s+(\w+(?:\s+\w+)*)',
        r'(\w+(?:\s+\w+)*)\s+(?:is\s+(?:working\s+on|building|developing))\s+(\w+(?:\s+\w+)*)'
    ]
}
117
+
118
class KGExtractor:
    """Extract entities and relationships from text files and write them to Neo4j.

    Entity candidates come from the regex tables in ``ENTITY_PATTERNS`` and,
    when the module-level spaCy model loaded, from spaCy NER as well.
    Relationship candidates come from ``RELATIONSHIP_PATTERNS``. Every
    candidate is filtered through ``_is_valid_entity`` before it is stored.

    Attributes:
        driver: The Neo4j driver, or ``None`` while not connected.
        stats: Per-run counters (files processed, entities/relationships
            extracted and created, errors).
    """

    # Neo4j node label for each internal entity type. Types missing from this
    # table are stored under the generic "Entity" label.
    _LABEL_MAP = {
        "PERSON": "Person",
        "ORG": "Company",
        "PROJECT": "Project",
        "TECHNOLOGY": "Tool",
    }

    # spaCy NER labels that are kept, mapped to the internal entity type.
    _SPACY_LABEL_MAP = {
        "PERSON": "PERSON",
        "ORG": "ORG",
        "GPE": "ORG",  # Geopolitical entity -> organization
        "PRODUCT": "PROJECT",
        "EVENT": "PROJECT",
        "WORK_OF_ART": "PROJECT",
    }

    # Common words and section headings that are never entities on their own.
    _SKIP_WORDS = frozenset({
        "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
        "is", "are", "was", "were", "been", "be", "have", "has", "had", "do", "does", "did",
        "will", "would", "could", "should", "can", "may", "might", "must",
        "this", "that", "these", "those", "here", "there", "where", "when", "why", "how",
        "what", "which", "who", "whom", "whose", "all", "any", "some", "many", "much",
        "more", "most", "less", "least", "few", "several", "each", "every", "both",
        "either", "neither", "not", "no", "yes", "true", "false", "good", "bad", "new", "old",
        "basic", "info", "role", "email", "communication", "professional", "relationship",
        "part", "quick", "meeting", "deep", "analysis", "context", "style", "leadership",
    })

    # Anchored "garbage" shapes, compiled once. All are tested with .match().
    # NOTE(review): the all-caps rule also rejects acronyms that the pattern
    # tables list explicitly (AWS, GCP, NVIDIA) - confirm that is intended.
    _GARBAGE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'^[A-Z]{2,}$',  # ALL CAPS single words like "CRITICAL", "NEGLIGIBLE", "IDENTIFY"
            r'^[^a-zA-Z]*$',  # No letters at all
            r'^\d',  # Starts with digit
            r'^(What|How|When|Where|Why|Which|If|Can|Do|Does|Did|Is|Are|Was|Were|Has|Have|Had|Will|Would|Could|Should|May|Might|Must|Let|After|Before|During)\b',  # Starts with question/auxiliary word
            r'^(No |Not |Any |All |Some |Each |Every |Both |Either |Neither |Very |Too |Just )',  # Starts with determiner/adverb
        )
    )

    # Characters that indicate markdown/code artifacts rather than a name.
    _ARTIFACT_CHARS = ('*', '`', '#', '|', '{', '}', '[', ']')

    # Leading words/phrases that indicate a sentence fragment, not an entity.
    _SKIP_PREFIXES = (
        "role at", "part of", "email", "professional", "quick", "manages",
        "accepts", "invites", "handles", "respects", "active on", "shares",
        "organises", "authors", "guest", "references to", "with", "on", "and",
        "for", "to", "the", "basic info", "communication style", "deep analysis",
        "- active",
    )

    _PROPER_NAME = re.compile(r'^[A-Z][a-z]+(?:\s[A-Z][a-z]+)+$')

    def __init__(self):
        self.driver = None
        self.stats = {
            "files_processed": 0,
            "entities_extracted": 0,
            "relationships_extracted": 0,
            "entities_created": 0,
            "relationships_created": 0,
            "errors": 0
        }

    def connect_neo4j(self):
        """Connect to Neo4j and verify the connection with a trivial query.

        Returns:
            True when connected; False when the neo4j package is missing or
            the connection/probe query fails. On failure the driver is closed
            and ``self.driver`` is reset to ``None``.
        """
        if GraphDatabase is None:
            print("Failed to connect to Neo4j: the 'neo4j' Python package is not installed")
            return False

        try:
            self.driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
            # Test connection
            with self.driver.session() as session:
                session.run("RETURN 1")
            print(f"Connected to Neo4j at {NEO4J_URI}")
        except Exception as e:
            print(f"Failed to connect to Neo4j: {e}")
            # Do not leave a half-open driver behind for callers to trip over.
            self.close_neo4j()
            return False
        return True

    def close_neo4j(self):
        """Close the Neo4j driver, if any, and forget it."""
        if self.driver:
            try:
                self.driver.close()
            finally:
                self.driver = None

    def load_state(self) -> Dict:
        """Load the processing state (file path -> last processed mtime).

        Returns:
            The stored mapping, or an empty dict when the state file is
            missing, unreadable, not valid JSON, or not a JSON object.
        """
        try:
            with open(STATE_FILE, 'r') as f:
                state = json.load(f)
        except FileNotFoundError:
            # First run: nothing has been processed yet.
            return {}
        except (OSError, ValueError) as e:
            print(f"Error loading state file: {e}")
            return {}

        if not isinstance(state, dict):
            print(f"Error loading state file: expected a JSON object, got {type(state).__name__}")
            return {}
        return state

    def save_state(self, state: Dict):
        """Persist the processing state; failures are reported, not raised.

        The state is written to a temporary sibling file and then moved into
        place so an interrupted write cannot leave a truncated state file.
        """
        tmp_path = STATE_FILE + ".tmp"
        try:
            os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
            with open(tmp_path, 'w') as f:
                json.dump(state, f, indent=2)
            os.replace(tmp_path, STATE_FILE)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving state file: {e}")

    def extract_entities_spacy(self, text: str) -> List[Tuple[str, str]]:
        """Extract ``(name, entity_type)`` pairs using spaCy NER.

        Returns an empty list when the spaCy model is not available.
        """
        if not SPACY_AVAILABLE:
            return []

        entities = []
        for ent in nlp(text).ents:
            if ent.label_ not in KGExtractor._SPACY_LABEL_MAP:
                continue
            entity_type = self._map_spacy_label(ent.label_)
            if self._is_valid_entity(ent.text, entity_type):
                entities.append((ent.text.strip(), entity_type))
        return entities

    def extract_entities_regex(self, text: str) -> List[Tuple[str, str]]:
        """Extract ``(name, entity_type)`` pairs using ``ENTITY_PATTERNS``.

        Patterns that spell out a capitalisation shape (they contain
        ``[A-Z]``) are matched case-sensitively. Under ``re.IGNORECASE`` such
        a pattern matches any run of plain words, so "met John Smith
        yesterday" would be captured as one candidate and then rejected,
        losing the name. Keyword-list patterns stay case-insensitive.
        """
        entities = []
        for entity_type, patterns in ENTITY_PATTERNS.items():
            for pattern in patterns:
                flags = 0 if "[A-Z]" in pattern else re.IGNORECASE
                for match in re.finditer(pattern, text, flags):
                    entity_name = match.group().strip()
                    if self._is_valid_entity(entity_name, entity_type):
                        entities.append((entity_name, entity_type))
        return entities

    def extract_relationships(self, text: str) -> List[Tuple[str, str, str, str]]:
        """Extract relationships using ``RELATIONSHIP_PATTERNS``.

        Returns:
            ``(entity1, rel_type, entity2, matched_text)`` tuples. ``entity1``
            must pass validation as a PERSON; ``entity2`` only has to pass
            the type-independent checks.
        """
        relationships = []
        for rel_type, patterns in RELATIONSHIP_PATTERNS.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    if len(match.groups()) < 2:
                        continue
                    entity1 = match.group(1).strip()
                    entity2 = match.group(2).strip()
                    if self._is_valid_entity(entity1, "PERSON") and self._is_valid_entity(entity2, None):
                        relationships.append((entity1, rel_type, entity2, match.group().strip()))
        return relationships

    def _map_spacy_label(self, spacy_label: str) -> str:
        """Map a spaCy entity label to the internal type, or ``"UNKNOWN"``."""
        return KGExtractor._SPACY_LABEL_MAP.get(spacy_label, "UNKNOWN")

    @staticmethod
    def _starts_with_phrase(lowered: str, phrase: str) -> bool:
        """Return True when ``lowered`` begins with ``phrase`` as whole word(s).

        A bare ``startswith`` would treat "tom hanks" as starting with "to"
        and "andrew" as starting with "and"; the character after the phrase
        must therefore not continue the word.
        """
        if not lowered.startswith(phrase):
            return False
        if len(lowered) == len(phrase):
            return True
        following = lowered[len(phrase)]
        return not (following.isalnum() or following == "_")

    def _is_valid_entity(self, text: str, entity_type: Optional[str]) -> bool:
        """Decide whether extracted text looks like a real entity.

        Args:
            text: Candidate entity text; surrounding whitespace is ignored.
            entity_type: Internal type ("PERSON", "ORG", ...) or ``None`` to
                apply only the type-independent checks.

        Returns:
            True when the candidate survives every filter.
        """
        text = text.strip()

        # Skip too short or too long
        if len(text) < 2 or len(text) > 100:
            return False

        lowered = text.lower()

        # Skip common words and technical terms
        if lowered in KGExtractor._SKIP_WORDS:
            return False

        # Skip anything spanning a line break. This must look at the whole
        # string: a start-anchored match can never see an embedded newline.
        if "\n" in text or "\r" in text:
            return False

        # Skip garbage patterns from spaCy misclassification. The "no letters
        # at all" pattern also guarantees that text contains a letter.
        if any(pattern.match(text) for pattern in KGExtractor._GARBAGE_PATTERNS):
            return False

        # Skip entities that look like markdown/code artifacts
        if any(c in text for c in KGExtractor._ARTIFACT_CHARS):
            return False

        # Skip sentence fragments introduced by a known leading word/phrase
        if any(KGExtractor._starts_with_phrase(lowered, prefix)
               for prefix in KGExtractor._SKIP_PREFIXES):
            return False

        # For person names, require proper capitalization and at least 2 words for full names
        if entity_type == "PERSON":
            # Must start with capital letter
            if not text[0].isupper():
                return False
            if " " in text:
                # Multi-word names must be "Capitalised Words" only
                if not KGExtractor._PROPER_NAME.match(text):
                    return False
            elif len(text) < 3:
                # Single names need at least three characters
                return False

        # For organizations, require at least one capital letter
        if entity_type == "ORG":
            if not re.search(r'[A-Z]', text):
                return False

        return True

    def entity_exists(self, entity_name: str, entity_type: str) -> Optional[str]:
        """Look up an entity by name, ignoring case.

        The label mapped from ``entity_type`` is tried first, then every
        label, because the entity may have been stored under another type.

        Returns:
            The stored ``name`` of a matching node, or ``None`` when there is
            no match or no database connection.
        """
        if not self.driver:
            return None

        label = KGExtractor._LABEL_MAP.get(entity_type, "Entity")

        # `label` comes from the fixed table above, never from file content,
        # so interpolating it into the query is safe. LIMIT 1 keeps
        # Result.single() from being handed several rows when names repeat.
        queries = (
            f"MATCH (n:{label}) WHERE toLower(n.name) = toLower($name) RETURN n.name LIMIT 1",
            "MATCH (n) WHERE toLower(n.name) = toLower($name) RETURN n.name LIMIT 1",
        )

        with self.driver.session() as session:
            for query in queries:
                record = session.run(query, name=entity_name).single()
                if record:
                    return record["n.name"]

        return None

    def create_entity(self, name: str, entity_type: str, source_file: str, dry_run: bool = False) -> str:
        """Create the entity in Neo4j unless a same-named node already exists.

        Args:
            name: Entity name as extracted.
            entity_type: Internal entity type; selects the node label.
            source_file: Path of the file the entity was extracted from.
            dry_run: When True, only print what would be created.

        Returns:
            The canonical name: the stored name of an existing node when one
            matches case-insensitively, otherwise ``name``.
        """
        if dry_run:
            print(f" [DRY] Would create entity: {name} ({entity_type}) from {source_file}")
            return name

        # Check if exists
        existing = self.entity_exists(name, entity_type)
        if existing:
            return existing

        if not self.driver:
            return name

        label = KGExtractor._LABEL_MAP.get(entity_type, "Entity")

        with self.driver.session() as session:
            session.run(
                f"MERGE (n:{label} {{name: $name}}) SET n.type = $type, n.source_file = $source_file, n.created_at = datetime()",
                name=name, type=entity_type, source_file=source_file
            )
        self.stats["entities_created"] += 1

        return name

    def create_relationship(self, entity1: str, rel_type: str, entity2: str, source_file: str, extracted_from: str, dry_run: bool = False):
        """Create (or merge) a relationship between two existing entities.

        The relationship is stored as ``(a)-[:RELATIONSHIP {type: rel_type}]->(b)``.
        If either endpoint does not exist the query matches nothing and no
        relationship is written; ``relationships_created`` is therefore
        increased by the count the server reports, not unconditionally.

        Args:
            entity1: Name of the source node.
            rel_type: Relationship type stored in the ``type`` property.
            entity2: Name of the target node.
            source_file: Path of the file the relationship came from.
            extracted_from: The text span that matched the pattern.
            dry_run: When True, only print what would be created.
        """
        if dry_run:
            print(f" [DRY] Would create relationship: {entity1} -> {rel_type} -> {entity2} from {source_file}")
            return

        if not self.driver:
            return

        with self.driver.session() as session:
            summary = session.run(
                """
                MATCH (a {name: $entity1}), (b {name: $entity2})
                MERGE (a)-[r:RELATIONSHIP {type: $rel_type}]->(b)
                SET r.source_file = $source_file, r.extracted_from = $extracted_from, r.created_at = datetime()
                """,
                entity1=entity1, entity2=entity2, rel_type=rel_type,
                source_file=source_file, extracted_from=extracted_from
            ).consume()
        self.stats["relationships_created"] += summary.counters.relationships_created

    def process_file(self, file_path: str, dry_run: bool = False) -> bool:
        """Extract entities and relationships from one file and store them.

        Args:
            file_path: Path of a UTF-8 text file.
            dry_run: When True, print what would be written instead of
                touching Neo4j.

        Returns:
            True on success; False when any step raised (the error is
            printed and counted in ``stats["errors"]``).
        """
        try:
            print(f"Processing: {file_path}")

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Regex extraction always runs; spaCy adds to it when available.
            entities = self.extract_entities_regex(content)
            if SPACY_AVAILABLE:
                entities.extend(self.extract_entities_spacy(content))

            # Remove duplicates. Sorting keeps the creation order stable
            # across runs, which matters when one name carries two types:
            # the first one created decides the node's label.
            entities = sorted(set(entities))

            # Extract relationships
            relationships = self.extract_relationships(content)

            print(f" Found {len(entities)} entities, {len(relationships)} relationships")

            # Create entities, remembering the canonical stored name of each
            entity_names = {}
            for entity_name, entity_type in entities:
                canonical_name = self.create_entity(entity_name, entity_type, file_path, dry_run)
                entity_names[entity_name] = canonical_name
                self.stats["entities_extracted"] += 1

            # Create relationships
            for entity1, rel_type, entity2, extracted_text in relationships:
                # Use canonical names if available
                canonical_entity1 = entity_names.get(entity1, entity1)
                canonical_entity2 = entity_names.get(entity2, entity2)

                self.create_relationship(canonical_entity1, rel_type, canonical_entity2,
                                         file_path, extracted_text, dry_run)
                self.stats["relationships_extracted"] += 1

            self.stats["files_processed"] += 1
            return True

        except Exception as e:
            # Per-file boundary: one bad file must not abort the whole run.
            print(f"Error processing {file_path}: {e}")
            self.stats["errors"] += 1
            return False

    def get_files_to_process(self, force_all: bool = False) -> List[str]:
        """Return the sorted list of files that need processing.

        A file qualifies when it matches one of ``MEMORY_PATHS`` and either
        ``force_all`` is set or its mtime is newer than the mtime recorded
        in the state file (files never seen before count as mtime 0).
        """
        import glob

        state = self.load_state()
        files_to_process = []

        for pattern in MEMORY_PATHS:
            for file_path in glob.glob(pattern):
                if not os.path.isfile(file_path):
                    continue
                # Check if file is new or modified
                mtime = os.path.getmtime(file_path)
                last_processed = state.get(file_path, 0)
                if force_all or mtime > last_processed:
                    files_to_process.append(file_path)

        return sorted(files_to_process)

    def update_state(self, file_path: str):
        """Record the file's current mtime in the state file."""
        state = self.load_state()
        state[file_path] = os.path.getmtime(file_path)
        self.save_state(state)

    def show_stats(self):
        """Print node counts by type, relationship counts by type, and the
        ten most recently created entities. Requires a Neo4j connection."""
        if not self.driver:
            print("No Neo4j connection to show stats")
            return

        with self.driver.session() as session:
            # Count entities by type
            result = session.run("MATCH (n) RETURN n.type as type, count(*) as count ORDER BY count DESC")

            print("\n=== Knowledge Graph Statistics ===")
            print("\nEntities by type:")
            for record in result:
                print(f" {record['type']}: {record['count']}")

            # Count relationships by type
            result = session.run("MATCH ()-[r]->() WHERE r.type IS NOT NULL RETURN r.type as type, count(*) as count ORDER BY count DESC")

            print("\nRelationships by type:")
            for record in result:
                print(f" {record['type']}: {record['count']}")

            # Show recent extractions
            result = session.run("MATCH (n) WHERE n.created_at IS NOT NULL RETURN n.name, n.type, n.created_at ORDER BY n.created_at DESC LIMIT 10")

            print("\nRecently created entities:")
            for record in result:
                print(f" {record['n.name']} ({record['n.type']}) - {record['n.created_at']}")
494
+
495
def main():
    """Command-line entry point.

    Modes:
        --stats     Print knowledge-graph statistics and exit.
        --file P    Process the single file P.
        (default)   Process every new or modified file under MEMORY_PATHS.
    ``--dry-run`` skips Neo4j entirely; ``--force`` reprocesses all files.

    Returns:
        Process exit status: 0 on success, 1 when Neo4j is unreachable, the
        requested file does not exist, or any file failed to process.
    """
    parser = argparse.ArgumentParser(description="Extract knowledge graph from memory files")
    parser.add_argument("--file", help="Process single file")
    parser.add_argument("--stats", action="store_true", help="Show extraction statistics")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be extracted without writing to Neo4j")
    parser.add_argument("--force", action="store_true", help="Force reprocess all files")

    args = parser.parse_args()

    extractor = KGExtractor()

    if args.stats:
        if not extractor.connect_neo4j():
            return 1
        try:
            extractor.show_stats()
        finally:
            # Close the driver even if a stats query raises.
            extractor.close_neo4j()
        return 0

    # Connect to Neo4j (unless dry run)
    if not args.dry_run and not extractor.connect_neo4j():
        print("Failed to connect to Neo4j. Use --dry-run to test extraction without database.")
        return 1

    try:
        if args.file:
            # Process single file
            if not os.path.isfile(args.file):
                print(f"File not found: {args.file}")
                return 1
            files = [args.file]
        else:
            # Process all new/modified files
            files = extractor.get_files_to_process(args.force)

            if not files:
                print("No new or modified files to process")
                return 0

            print(f"Found {len(files)} files to process")

        for file_path in files:
            success = extractor.process_file(file_path, args.dry_run)
            # Only mark a file as processed when extraction succeeded, so a
            # failed file is retried on the next run (single-file mode too).
            if success and not args.dry_run:
                extractor.update_state(file_path)

        # Print final stats
        print("\n=== Extraction Complete ===")
        print(f"Files processed: {extractor.stats['files_processed']}")
        print(f"Entities extracted: {extractor.stats['entities_extracted']}")
        print(f"Relationships extracted: {extractor.stats['relationships_extracted']}")
        if not args.dry_run:
            print(f"Entities created: {extractor.stats['entities_created']}")
            print(f"Relationships created: {extractor.stats['relationships_created']}")
        print(f"Errors: {extractor.stats['errors']}")

        return 1 if extractor.stats["errors"] else 0

    finally:
        if not args.dry_run:
            extractor.close_neo4j()


if __name__ == "__main__":
    sys.exit(main())