opencode-semantic-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ """Pattern-based extractors for decisions, blockers, and learnings from text."""
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+
6
# Regexes whose capture group 1 holds the substance of a decision statement.
# Each entry is (pattern, flag). _extract_with_patterns unpacks the flag as
# `_strict` and does not currently read it.
DECISION_PATTERNS = [
    (
        # "(we|I) decided/chose/... to <20-300 chars>", ending at "." or "$".
        # NOTE(review): every verb alternative must be followed by "to", so
        # phrasings such as "went with X" or "opted for X" cannot match --
        # confirm this is intended.
        r"(?:we |I )?(?:decided|chose|went with|selected|opted for)\s+to\s+(.{20,300}?)(?:\.|$)",
        True,
    ),
    # "the decision is to <...>" / "decision: <...>"
    (r"(?:the )?decision(?:\s+is)?\s*(?:to|:)\s+(.{20,300}?)(?:\.|$)", True),
    # "we'll / I'll / we will / I will <...>", cut before " because" / " since".
    (r"(?:we'll |I'll |we will |I will )(.{20,250}?)(?:\s+because|\s+since|\.|$)", True),
    (
        # "going to use/implement/create/... <15-200 chars>"; the only entry
        # whose flag is False.
        r"going to\s+(?:use|implement|create|build|add|remove|change|update|fix)\s+(.{15,200}?)(?:\.|$)",
        False,
    ),
]
18
+
19
# Regexes whose capture group 1 holds the description of a blocker.
# Each entry is (pattern, flag). _extract_with_patterns unpacks the flag as
# `_strict` and does not currently read it. Several entries overlap (e.g. the
# two "blocked by" forms); near-identical captures are dropped later by
# _is_duplicate.
BLOCKER_PATTERNS = [
    # Explicit blocker declarations - high confidence
    (r"(?:the )?blocker(?:\s+is)?(?:\s*:\s*|\s+)(.{20,250}?)(?:\.|$)", True),
    (r"blocking issue(?:\s*:\s*|\s+is\s+)(.{20,250}?)(?:\.|$)", True),
    # Blocked by pattern - common usage
    (r"(?:we're|we are|I'm|I am)\s+blocked\s+by\s+(.{20,200}?)(?:\.|$)", True),
    (r"blocked\s+by\s+(?:the\s+)?(.{20,200}?)(?:\.|$)", True),
    # Waiting on/for pattern - with human context (capture the whole thing including approval/review)
    (
        r"[Ww]aiting\s+(?:on|for)\s+((?:approval|review|feedback|response|sign-off|authorization).{10,200}?)(?:\.|$)",
        True,
    ),
    (r"(?:we're|we are|I'm|I am)\s+waiting\s+(?:on|for)\s+(.{20,200}?)(?:\.|$)", True),
    # Can't merge/ship patterns
    (r"[Cc]an't\s+(?:merge|ship|release|deploy)\s+(?:until|because)\s+(.{20,200}?)(?:\.|$)", True),
    # Needs to wait pattern; the capture stops before " before" when present.
    (r"need(?:s)?\s+to\s+wait\s+(?:for\s+)?(.{20,200}?)(?:\s+before|\.|$)", True),
]
37
+
38
# Patterns that indicate something is NOT a real blocker (technical waits, code behavior).
# _is_blocker_false_positive searches each of these in the lower-cased
# "context + content" string of a candidate blocker; one hit discards it.
BLOCKER_FALSE_POSITIVE_PATTERNS = [
    # A program construct (code, function, test, job, ...) is waiting/blocked.
    r"(?:the |a )?(?:code|function|method|test|script|loop|task|job|process)\s+(?:is\s+)?(?:waiting|blocked)",
    # Waiting for a runtime value: response, result, callback, promise, ...
    # NOTE(review): "response" is also one of the nouns accepted by the
    # "waiting on/for" entry in BLOCKER_PATTERNS, so such matches are always
    # filtered out again -- confirm this is intended.
    r"wait(?:ing)?\s+(?:for|on)\s+(?:the\s+)?(?:response|result|callback|promise|async|event|signal|input|output)",
    # Waiting for UI state (DOM, page, component, render, load).
    r"(?:need|waiting)\s+(?:for\s+)?(?:the\s+)?(?:UI|DOM|page|component|element|render|load)",
    # Waiting on a concurrency primitive or connection.
    r"(?:waiting|blocked)\s+(?:on|for)\s+(?:the\s+)?(?:lock|mutex|semaphore|thread|connection)",
    # Awaiting a model, embedding, transport, stream or socket.
    r"(?:await|waiting)\s+(?:for\s+)?(?:the\s+)?(?:model|embedding|transport|stream|socket)",
    # "pipeline to complete/finish/pass"
    r"pipeline\s+to\s+(?:complete|finish|pass)",
    # "waiting for it/this/that to complete/finish/load/ready"
    r"(?:waiting|need)\s+(?:for\s+)?(?:it|this|that)\s+to\s+(?:complete|finish|load|ready)",
]
48
+
49
# Regexes whose capture group 1 holds a learned fact (20-250 characters,
# ending at "." or "$").
# Each entry is (pattern, flag). _extract_with_patterns unpacks the flag as
# `_strict` and does not currently read it.
LEARNING_PATTERNS = [
    # "TIL: <...>" / "today I learned <...>"
    (r"(?:TIL|today I learned)\s*[:\s]+(.{20,250}?)(?:\.|$)", True),
    # "discovered / found out / realized that <...>"
    (r"(?:discovered|found out|realized)\s+that\s+(.{20,250}?)(?:\.|$)", True),
    # "turns out (that) <...>"
    (r"turns out\s+(?:that\s+)?(.{20,250}?)(?:\.|$)", True),
    (
        # "(the|a) key/important insight/takeaway/lesson (is) (that) <...>"
        r"(?:the |a )?(?:key|important)\s+(?:insight|takeaway|lesson)\s+(?:is\s+)?(?:that\s+)?(.{20,250}?)(?:\.|$)",
        True,
    ),
    # "(I|we) learned that <...>"
    (r"(?:I |we )?learned\s+that\s+(.{20,250}?)(?:\.|$)", True),
]
59
+
60
# Regexes that mark text as source code or markup rather than prose.
# _is_code_fragment searches them with re.MULTILINE | re.IGNORECASE, so "^" and
# "$" apply per line and letter classes such as [A-Z_] also match lowercase.
CODE_INDICATORS = [
    r"^\s*[{}\[\]();,]",  # line starts with a bracket, paren, ";" or ","
    r"[{}\[\]]",  # a brace or square bracket anywhere
    # line starts with a programming keyword followed by whitespace
    r"^\s*(?:def|class|function|const|let|var|import|from|return|if|else|for|while)\s",
    r"^\s*[A-Z_]{2,}\s*[=:]",  # "NAME =" / "NAME:" at line start
    r"^\s*\w+\s*=\s*[{\[\(]",  # assignment whose value opens with {, [ or (
    r"^\s*#\s*\w+",  # "#"-prefixed line (comment or heading)
    r"^\s*//",  # "//" comment
    r"^\s*\*\s",  # "* " at line start
    r"^\s*-\s*\[",  # "- [" at line start
    r"^\s*```",  # fenced code marker
    r"^\s*\|",  # line starts with "|"
    r"^\s*>",  # line starts with ">"
    r"TEXT,?\s*--",  # "TEXT --" / "TEXT, --"; presumably SQL column definitions
    r"^\s*\d+:\s",  # "<digits>: " prefix
    # line ends with a known file extension
    r"\.(?:rb|py|js|ts|go|rs|java|cpp|c|h|yml|yaml|json|md|txt)\s*$",
    r":\s*\w+\s*}",  # ": value }" tail
]
78
+
79
# Regexes describing text too trivial to keep. _is_noise tests them
# case-insensitively against the whitespace-stripped text.
NOISE_PATTERNS = [
    # a bare imperative verb plus exactly one word, e.g. "fix tests"
    r"^(?:fix|update|add|remove|change|check|test|run|see|look|try|use)\s+\w+$",
    r"^\w+\s+\w+$",  # exactly two words
    r"^the\s+\w+$",  # "the" plus one word
    r"^\d+",  # starts with a digit
    r"^[^a-zA-Z]*$",  # contains no ASCII letters
    r'["\']$',  # a quote character at the very end
    r"^\s*$",  # empty or whitespace only
    r"^it is\s+",  # starts with "it is "
]
89
+
90
# Regexes that flag list items, markdown artifacts and mid-sentence fragments.
# _is_fragment applies them with re.search and no flags, so matching is
# case-sensitive and "^" only anchors at the start of the stripped text.
FRAGMENT_INDICATORS = [
    r"^\*+\s",  # "* " / "** " prefix
    r"^-+\s",  # "- " / "-- " prefix
    r"^\|",  # starts with "|"
    r"^>",  # starts with ">"
    r"^#",  # starts with "#"
    r"^\d+\.\s",  # "1. " numbered item
    r"^\d+\)\s",  # "1) " numbered item
    r"^[a-z]+\.\s",  # lowercase word then ". " (tail of a previous sentence)
    r"^[a-z]+,\s",  # lowercase word then ", "
    r"\*\*\s*$",  # ends with "**"
    r"\*\*\s*-",  # "**" followed by "-"
    r"\s+\+\s+",  # " + " anywhere
    r"\s+-\s+",  # " - " anywhere
]
105
+
106
# Openings that suggest the capture began in the middle of a sentence.
# _is_fragment applies them with re.match and re.IGNORECASE.
BROKEN_START_PATTERNS = [
    r"^and\s+(?:the|a|if|when|also)\b",  # "and the ...", "and if ..."
    r"^or\s+(?:the|a|if|when|also)\b",  # "or the ...", "or when ..."
    r"^but\s+(?:the|a|now|if|when)\b",  # "but the ...", "but now ..."
    r"^[a-z]+\s+and\s+if\b",  # "<word> and if ..."
]
112
+
113
# Thresholds enforced by _is_valid_insight on cleaned content.
# Minimum number of whitespace-separated words.
MIN_WORD_COUNT = 4
# Minimum length in characters.
MIN_CONTENT_LENGTH = 25
# Maximum length in characters.
MAX_CONTENT_LENGTH = 400
116
+
117
+
118
@dataclass
class ExtractedInsight:
    """An insight extracted from text.

    Attributes:
        category: Label supplied by the extractor; the extractors in this
            module pass "decision", "blocker" or "fact".
        content: The cleaned text captured by the matching pattern.
        context: The source text surrounding the match (up to 100 characters
            on each side), stripped of outer whitespace.
    """

    category: str
    content: str
    context: str
125
+
126
+
127
def _is_code_fragment(text: str) -> bool:
    """Return True when any CODE_INDICATORS regex is found in *text*.

    The search is case-insensitive and multiline, so ``^`` and ``$`` in the
    indicators anchor at every line of *text*, not only at its start and end.
    """
    flags = re.MULTILINE | re.IGNORECASE
    return any(re.search(indicator, text, flags) for indicator in CODE_INDICATORS)
133
+
134
+
135
def _is_noise(text: str) -> bool:
    """Return True when *text* is too trivial to be a useful insight.

    The text is stripped of surrounding whitespace and tested
    case-insensitively against every regex in NOISE_PATTERNS.

    Args:
        text: Candidate insight text.

    Returns:
        True if any noise pattern applies, otherwise False.
    """
    candidate = text.strip()
    # re.search rather than re.match: re.match only tries position 0, which
    # made the end-anchored quote pattern (["']$) match nothing but a string
    # that is itself a single quote. The "^"-anchored patterns behave the same
    # under search because re.MULTILINE is not set.
    return any(
        re.search(pattern, candidate, re.IGNORECASE) for pattern in NOISE_PATTERNS
    )
141
+
142
+
143
def _clean_content(text: str) -> str:
    """Normalise a raw capture into tidy single-line text.

    Whitespace runs collapse to one space, leading non-word characters are
    removed, and trailing characters other than word characters and ``.``,
    ``!``, ``?`` are removed.
    """
    collapsed = re.sub(r"\s+", " ", text)
    without_prefix = re.sub(r"^[^\w]+", "", collapsed)
    without_suffix = re.sub(r"[^\w.!?]+$", "", without_prefix)
    return without_suffix.strip()
150
+
151
+
152
def _is_fragment(text: str) -> bool:
    """Return True when *text* looks like a broken piece of a larger structure.

    Empty text, text matching a FRAGMENT_INDICATORS regex (case-sensitive
    search), text opening with a BROKEN_START_PATTERNS regex (case-insensitive
    match), and text ending in ``;`` all count as fragments.
    """
    stripped = text.strip()
    if not stripped:
        return True

    if any(re.search(marker, stripped) for marker in FRAGMENT_INDICATORS):
        return True

    if any(
        re.match(opener, stripped, re.IGNORECASE) for opener in BROKEN_START_PATTERNS
    ):
        return True

    # A trailing semicolon suggests a statement rather than a sentence.
    return stripped.endswith(";")
170
+
171
+
172
def _is_valid_insight(content: str) -> bool:
    """Decide whether cleaned *content* is worth storing as an insight.

    Content is rejected when its length falls outside
    [MIN_CONTENT_LENGTH, MAX_CONTENT_LENGTH], when it has fewer than
    MIN_WORD_COUNT whitespace-separated words, when it is classified as code,
    noise or a fragment, or when fewer than half of its characters are
    alphabetic.
    """
    length = len(content)
    if not MIN_CONTENT_LENGTH <= length <= MAX_CONTENT_LENGTH:
        return False

    if len(content.split()) < MIN_WORD_COUNT:
        return False

    if _is_code_fragment(content) or _is_noise(content) or _is_fragment(content):
        return False

    # At least half of the characters must be letters.
    letters = sum(map(str.isalpha, content))
    return letters >= length * 0.5
198
+
199
+
200
def _normalize_for_dedup(text: str) -> str:
    """Return an order-insensitive key for *text*.

    The key is the distinct lower-cased words of *text*, sorted and joined
    with single spaces.
    """
    tokens = re.findall(r"\w+", text.lower())
    distinct_sorted = sorted(set(tokens))
    return " ".join(distinct_sorted)
204
+
205
+
206
def _is_duplicate(content: str, seen_contents: set[str]) -> bool:
    """Return True when *content* largely repeats an entry of *seen_contents*.

    Only lower-cased words of three or more characters are compared. Two texts
    are duplicates when the shared words make up more than 60% of the smaller
    of the two word sets. Texts without any such words never count as
    duplicates.
    """

    def significant_words(value: str) -> set[str]:
        """Lower-cased words of *value* that are at least three characters long."""
        return set(re.findall(r"\w{3,}", value.lower()))

    candidate = significant_words(content)

    for previous in seen_contents:
        earlier = significant_words(previous)
        if not candidate or not earlier:
            continue

        overlap = len(candidate & earlier)
        smaller = min(len(candidate), len(earlier))
        if overlap / smaller > 0.6:
            return True

    return False
222
+
223
+
224
def _extract_with_patterns(
    text: str, patterns: list[tuple[str, bool]], category: str
) -> list[ExtractedInsight]:
    """Run every regex in *patterns* over *text* and collect valid insights.

    Args:
        text: Source text to scan.
        patterns: ``(regex, flag)`` pairs; group 1 of each regex is the
            candidate content. The flag is unpacked but not used here.
        category: Label stored on every insight produced.

    Returns:
        Insights in discovery order (pattern by pattern, match by match),
        skipping candidates that fail validation or that duplicate content
        already accepted during this call.
    """
    flags = re.IGNORECASE | re.MULTILINE
    text_length = len(text)
    accepted: set[str] = set()
    results = []

    for regex, _strict in patterns:
        for hit in re.finditer(regex, text, flags):
            candidate = _clean_content(hit.group(1))

            if not _is_valid_insight(candidate) or _is_duplicate(candidate, accepted):
                continue
            accepted.add(candidate)

            # Keep up to 100 characters either side of the match as context.
            window_start = max(0, hit.start() - 100)
            window_end = min(text_length, hit.end() + 100)

            results.append(
                ExtractedInsight(
                    category=category,
                    content=candidate,
                    context=text[window_start:window_end].strip(),
                )
            )

    return results
249
+
250
+
251
def extract_decisions(text: str) -> list[ExtractedInsight]:
    """Return the insights matched by DECISION_PATTERNS, labelled "decision"."""
    decisions = _extract_with_patterns(
        text, patterns=DECISION_PATTERNS, category="decision"
    )
    return decisions
254
+
255
+
256
def _is_blocker_false_positive(content: str, context: str) -> bool:
    """Return True when a blocker candidate matches a known false-positive shape.

    The context and content are joined with a space and lower-cased, then
    searched case-insensitively for each BLOCKER_FALSE_POSITIVE_PATTERNS regex.
    """
    haystack = f"{context} {content}".lower()
    return any(
        re.search(benign, haystack, re.IGNORECASE)
        for benign in BLOCKER_FALSE_POSITIVE_PATTERNS
    )
263
+
264
+
265
def extract_blockers(text: str) -> list[ExtractedInsight]:
    """Return the insights matched by BLOCKER_PATTERNS, labelled "blocker".

    Candidates that _is_blocker_false_positive recognises are dropped.
    """
    genuine = []
    for candidate in _extract_with_patterns(text, BLOCKER_PATTERNS, "blocker"):
        if _is_blocker_false_positive(candidate.content, candidate.context):
            continue  # technical wait or code behaviour, not a real blocker
        genuine.append(candidate)
    return genuine
274
+
275
+
276
def extract_learnings(text: str) -> list[ExtractedInsight]:
    """Return the insights matched by LEARNING_PATTERNS, labelled "fact"."""
    learnings = _extract_with_patterns(
        text, patterns=LEARNING_PATTERNS, category="fact"
    )
    return learnings
279
+
280
+
281
def extract_all_insights(text: str) -> list[ExtractedInsight]:
    """Return decisions, then blockers, then learnings found in *text*."""
    return [
        *extract_decisions(text),
        *extract_blockers(text),
        *extract_learnings(text),
    ]