lwazi 1.7.3 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "lwazi",
3
- "version": "1.7.3",
3
+ "version": "1.8.0",
4
4
  "description": "Lwazi is an AI assistant for Laravel. Install with one command to add an AI assistant to your Laravel app.",
5
5
  "main": "bin/lwazi.js",
6
6
  "bin": {
@@ -6,6 +6,7 @@ use DOMDocument;
6
6
  use DOMXPath;
7
7
  use Illuminate\Support\Facades\Http;
8
8
  use Illuminate\Support\Str;
9
+ use Lwazi\Core\Services\ContentIndexer;
9
10
 
10
11
  class NavigationCrawler
11
12
  {
@@ -20,13 +21,15 @@ class NavigationCrawler
20
21
  protected bool $debug;
21
22
  protected ?string $stateFile = null;
22
23
  protected int $retryAttempts = 3;
23
- protected int $retryDelay = 1000; // milliseconds
24
+ protected int $retryDelay = 1000;
25
+ protected ContentIndexer $contentIndexer;
24
26
 
25
27
  public function __construct(string $rootUrl, bool $debug = false)
26
28
  {
27
29
  $this->rootUrl = rtrim($rootUrl, '/');
28
30
  $this->debug = $debug;
29
31
  $this->stateFile = storage_path('lwazi/crawl_state.json');
32
+ $this->contentIndexer = new ContentIndexer();
30
33
  }
31
34
 
32
35
  public function setMaxPages(int $max): self
@@ -83,6 +86,8 @@ class NavigationCrawler
83
86
 
84
87
  $this->visited[$url] = true;
85
88
 
89
+ $this->contentIndexer->indexPage($url, $html);
90
+
86
91
  $meta = $this->extractPageMeta($html, $url);
87
92
  $this->nodes[$url] = $meta;
88
93
  $this->flatIndex[$url] = [
@@ -401,6 +406,10 @@ class NavigationCrawler
401
406
 
402
407
  file_put_contents($path, json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
403
408
  $this->log("Manifest saved to $path");
409
+
410
+ $contentPath = storage_path('lwazi/content_index.json');
411
+ file_put_contents($contentPath, json_encode($this->contentIndexer->toArray(), JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
412
+ $this->log("Content index saved to $contentPath");
404
413
  }
405
414
 
406
415
  protected function log(string $msg): void
@@ -0,0 +1,245 @@
1
+ <?php
2
+
3
+ namespace Lwazi\Core\Services;
4
+
5
/**
 * Builds and queries a small TF-IDF index over the text of crawled HTML pages.
 *
 * Pages are added with indexPage(), queried with search(), and the whole index
 * can be round-tripped through toArray()/fromArray() for JSON persistence.
 */
class ContentIndexer
{
    /** @var array Not read or written by any method of this class. */
    protected array $index = [];

    /** @var array<string, array> Extracted page content keyed by URL. */
    protected array $documents = [];

    /** @var array<string, array<string, int>> Token occurrence counts per URL. */
    protected array $termFrequency = [];

    /** @var array<string, int> Number of indexed pages that contain each token. */
    protected array $documentFrequency = [];

    /** @var int Number of pages currently indexed. */
    protected int $totalDocuments = 0;

    /**
     * Extract the text of an HTML page and add it to the index.
     *
     * Indexing a URL that is already present replaces the earlier entry, so the
     * document count and term statistics are never counted twice. Pages that
     * yield no text at all are not recorded.
     *
     * @param string $url  Key under which the page is stored.
     * @param string $html Raw HTML markup of the page (may be empty).
     */
    public function indexPage(string $url, string $html): void
    {
        // Remove the previous version first so re-indexing cannot inflate the statistics.
        $this->forgetPage($url);

        $content = $this->extractContent($url, $html);

        // The combined text always contains separator spaces, so it must be trimmed
        // before testing for "nothing extracted".
        if (trim((string) ($content['text'] ?? '')) === '') {
            return;
        }

        $this->documents[$url] = $content;
        $this->totalDocuments++;

        $counts = array_count_values($this->tokenize($content['text']));
        $this->termFrequency[$url] = $counts;

        // Each distinct token of the page contributes exactly one to its document frequency.
        foreach ($counts as $token => $count) {
            $this->documentFrequency[$token] = ($this->documentFrequency[$token] ?? 0) + 1;
        }
    }

    /**
     * Remove a page and its contribution to the term statistics, if it is indexed.
     */
    protected function forgetPage(string $url): void
    {
        if (!isset($this->documents[$url])) {
            return;
        }

        foreach (array_keys($this->termFrequency[$url] ?? []) as $token) {
            if (!isset($this->documentFrequency[$token])) {
                continue;
            }
            if (--$this->documentFrequency[$token] <= 0) {
                unset($this->documentFrequency[$token]);
            }
        }

        unset($this->documents[$url], $this->termFrequency[$url]);
        $this->totalDocuments = max(0, $this->totalDocuments - 1);
    }

    /**
     * Pull the title, headings, paragraphs and list items out of an HTML page.
     *
     * @return array{url:string,title:string,headings:array,paragraphs:array,lists:array,text:string}
     *         'paragraphs' is capped at 10 entries and 'lists' at 20; 'text' is the
     *         whitespace-normalised, trimmed concatenation of everything extracted.
     */
    protected function extractContent(string $url, string $html): array
    {
        // Collect libxml parse errors internally instead of emitting warnings,
        // and restore the caller's setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new \DOMDocument();
        // loadHTML() throws a ValueError for an empty string on PHP 8, so skip parsing then.
        if ($html !== '') {
            // NOTE(review): without a charset declaration in the markup loadHTML() may
            // mis-decode UTF-8 pages - confirm crawled pages declare their encoding.
            $dom->loadHTML($html);
        }
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new \DOMXPath($dom);

        $titles = $this->collectText($xpath, '//title', 1);
        $title = $titles[0] ?? '';

        // Headings are grouped by level (all h1 first, then h2, ...), not document order.
        $headings = [];
        foreach ([1, 2, 3, 4, 5, 6] as $level) {
            foreach ($this->collectText($xpath, "//h{$level}", 1) as $heading) {
                $headings[] = $heading;
            }
        }

        // Very short paragraphs / list items are skipped (more than 20 and 5 bytes required).
        $paragraphs = $this->collectText($xpath, '//p', 21);
        $lists = $this->collectText($xpath, '//ul//li | //ol//li', 6);

        $text = $title . ' ' . implode(' ', $headings) . ' ' . implode(' ', $paragraphs) . ' ' . implode(' ', $lists);
        $text = trim(preg_replace('/\s+/', ' ', $text) ?? $text);

        return [
            'url' => $url,
            'title' => $title,
            'headings' => $headings,
            'paragraphs' => array_slice($paragraphs, 0, 10),
            'lists' => array_slice($lists, 0, 20),
            'text' => $text,
        ];
    }

    /**
     * Return the trimmed text of every node matching $query whose byte length
     * is at least $minLength.
     *
     * @return string[]
     */
    protected function collectText(\DOMXPath $xpath, string $query, int $minLength): array
    {
        $texts = [];
        $nodes = $xpath->query($query);
        if ($nodes === false) {
            return $texts;
        }

        foreach ($nodes as $node) {
            $text = trim($node->textContent ?? '');
            if (strlen($text) >= $minLength) {
                $texts[] = $text;
            }
        }

        return $texts;
    }

    /**
     * Split text into lower-case alphanumeric tokens of at least three
     * characters, dropping stop words.
     *
     * @return string[]
     */
    protected function tokenize(string $text): array
    {
        $text = strtolower($text);
        $text = preg_replace('/[^a-z0-9\s]/', ' ', $text) ?? '';
        $tokens = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Flip once so each membership test is a hash lookup rather than a list scan.
        $stopWords = array_flip($this->getStopWords());

        return array_values(array_filter($tokens, static function ($token) use ($stopWords) {
            return strlen($token) >= 3 && !isset($stopWords[$token]);
        }));
    }

    /**
     * Words that are never indexed.
     *
     * @return string[]
     */
    protected function getStopWords(): array
    {
        return [
            'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
            'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
            'how', 'its', 'may', 'now', 'old', 'see', 'than', 'that', 'this', 'too',
            'was', 'will', 'with', 'have', 'from', 'they', 'been', 'were', 'said',
            'each', 'which', 'their', 'what', 'when', 'where', 'who', 'will',
            'about', 'after', 'would', 'there', 'could', 'other', 'into', 'more',
            'some', 'them', 'then', 'these', 'so', 'just', 'because', 'being',
            'also', 'before', 'here', 'how', 'made', 'make', 'only', 'over',
            'such', 'take', 'through', 'under', 'very', 'what', 'your', 'like',
        ];
    }

    /**
     * Rank indexed pages against a list of search terms using TF-IDF.
     *
     * Terms are normalised with the same tokenizer used for indexing, so
     * multi-word or punctuated terms match on their individual words, and
     * duplicate or non-scalar terms are ignored.
     *
     * @param array $terms Search terms (strings).
     * @param int   $limit Maximum number of results to return.
     *
     * @return array<int, array{url:string,title:string,matched_terms:array,score:float,snippet:string}>
     *         Best match first; empty when nothing matches.
     */
    public function search(array $terms, int $limit = 5): array
    {
        if (empty($terms) || $this->totalDocuments <= 0 || $limit <= 0) {
            return [];
        }

        $queryTokens = $this->normalizeTerms($terms);
        if (empty($queryTokens)) {
            return [];
        }

        $scores = [];

        foreach ($this->documents as $url => $doc) {
            $frequencies = $this->termFrequency[$url] ?? null;
            if (!is_array($doc) || !is_array($frequencies)) {
                continue;
            }

            $score = 0.0;
            $matchedTerms = [];

            foreach ($queryTokens as $token) {
                if (!isset($frequencies[$token])) {
                    continue;
                }

                $df = max(1, (int) ($this->documentFrequency[$token] ?? 1));
                // Smoothed IDF: plain log(N / df) is 0 for a term present in every
                // document, which made a single-page index unsearchable.
                $idf = log(1 + $this->totalDocuments / $df);
                $score += $frequencies[$token] * $idf;
                $matchedTerms[] = $token;
            }

            if ($score > 0) {
                $scores[$url] = [
                    'score' => $score,
                    'matched_terms' => $matchedTerms,
                    'doc' => $doc,
                ];
            }
        }

        uasort($scores, static fn($a, $b) => $b['score'] <=> $a['score']);

        $results = [];
        foreach (array_slice($scores, 0, $limit, true) as $url => $data) {
            $url = (string) $url;
            $title = trim((string) ($data['doc']['title'] ?? ''));

            $results[] = [
                'url' => $url,
                // Pages without a <title> store '', so test for emptiness rather than null.
                'title' => $title !== '' ? $title : basename($url),
                'matched_terms' => $data['matched_terms'],
                'score' => $data['score'],
                'snippet' => $this->generateSnippet($data['doc'], $data['matched_terms']),
            ];
        }

        return $results;
    }

    /**
     * Turn raw search terms into a de-duplicated list of index tokens.
     *
     * @return string[]
     */
    protected function normalizeTerms(array $terms): array
    {
        $seen = [];
        foreach ($terms as $term) {
            if (!is_string($term) && !is_int($term) && !is_float($term)) {
                continue; // e.g. nested arrays coming from decoded JSON
            }
            foreach ($this->tokenize((string) $term) as $token) {
                $seen[$token] = true;
            }
        }

        // Numeric-looking tokens become integer array keys; hand back strings.
        return array_map('strval', array_keys($seen));
    }

    /**
     * Build a short excerpt (up to 200 characters) around the earliest
     * occurrence of any matched term.
     *
     * The excerpt is taken from the stored paragraphs, falling back to the list
     * items, and keeps the original letter case. A leading "..." marks an
     * excerpt that does not start at the beginning of the text; a trailing
     * "..." is always appended.
     */
    protected function generateSnippet(array $doc, array $terms): string
    {
        $text = implode(' ', $doc['paragraphs'] ?? []);

        if ($text === '') {
            $text = implode(' ', $doc['lists'] ?? []);
        }

        $text = trim(preg_replace('/\s+/', ' ', $text) ?? $text);

        if ($text === '') {
            return '';
        }

        $bestPos = null;
        foreach ($terms as $term) {
            $term = (string) $term;
            if ($term === '') {
                continue;
            }

            // Case-insensitive, character-based search so the displayed text keeps its
            // case and multi-byte characters are never split.
            $pos = mb_stripos($text, $term, 0, 'UTF-8');
            if ($pos !== false && ($bestPos === null || $pos < $bestPos)) {
                $bestPos = $pos;
            }
        }

        // Show up to 50 characters of context before the first hit.
        $start = max(0, ($bestPos ?? 0) - 50);
        $snippet = trim(mb_substr($text, $start, 200, 'UTF-8'));

        if ($start > 0) {
            $snippet = '...' . $snippet;
        }

        return $snippet . '...';
    }

    /**
     * Export the index as a plain array suitable for json_encode().
     */
    public function toArray(): array
    {
        return [
            'documents' => $this->documents,
            'term_frequency' => $this->termFrequency,
            'document_frequency' => $this->documentFrequency,
            'total_documents' => $this->totalDocuments,
        ];
    }

    /**
     * Rebuild an indexer from the structure produced by toArray().
     *
     * Missing or malformed sections are treated as empty; when the document
     * count is absent it is derived from the number of stored documents.
     */
    public static function fromArray(array $data): self
    {
        $section = static function (string $key) use ($data): array {
            return isset($data[$key]) && is_array($data[$key]) ? $data[$key] : [];
        };

        $indexer = new self();
        $indexer->documents = $section('documents');
        $indexer->termFrequency = $section('term_frequency');
        $indexer->documentFrequency = $section('document_frequency');

        $total = $data['total_documents'] ?? null;
        $indexer->totalDocuments = is_numeric($total)
            ? max(0, (int) $total)
            : count($indexer->documents);

        return $indexer;
    }
}
@@ -76,6 +76,12 @@ class LwaziService
76
76
  }
77
77
  }
78
78
 
79
+ $contentResponse = $this->searchContent($message);
80
+ if ($contentResponse) {
81
+ $this->conversationHistory[] = ['role' => 'assistant', 'content' => $contentResponse];
82
+ return $contentResponse;
83
+ }
84
+
79
85
  $dataResponse = $this->fetchRelevantData($message);
80
86
  if ($dataResponse) {
81
87
  $this->conversationHistory[] = ['role' => 'assistant', 'content' => $dataResponse];
@@ -217,6 +223,80 @@ class LwaziService
217
223
  return null;
218
224
  }
219
225
 
226
/**
 * Answer a message from the crawled content index, if one exists.
 *
 * Loads lwazi/content_index.json from storage, derives search terms from the
 * message and formats up to three hits as a Markdown reply.
 *
 * @return string|null The formatted reply, or null when the index file is
 *                     missing or empty, no terms could be derived, or nothing matched.
 */
protected function searchContent(string $message): ?string
{
    $indexFile = storage_path('lwazi/content_index.json');
    if (!file_exists($indexFile)) {
        return null;
    }

    $decoded = json_decode(file_get_contents($indexFile), true);
    if (!$decoded || empty($decoded['documents'])) {
        return null;
    }

    // Rebuild the indexer before deriving terms, matching the original call order.
    $indexer = ContentIndexer::fromArray($decoded);

    $terms = $this->extractSearchTerms($message);
    if (empty($terms)) {
        return null;
    }

    $hits = $indexer->search($terms, 3);
    if (empty($hits)) {
        return null;
    }

    // One Markdown section per hit: bold title, snippet, then a link to the page.
    $sections = array_map(
        fn($hit) => "**{$hit['title']}**\n{$hit['snippet']}\n[Read more]({$hit['url']})\n\n",
        $hits
    );

    return "I found some relevant information:\n\n" . implode('', $sections);
}
261
+
262
/**
 * Derive search terms for a user message.
 *
 * A topic word is taken from the message: either the word following a
 * determiner such as "my"/"the"/"these", or failing that the first word of at
 * least four characters that is not a question/filler word. The model is then
 * asked (via callOllama) for synonyms of that topic, and the topic itself is
 * added to the list.
 *
 * @return string[] Lower-cased, de-duplicated terms; empty when no topic was found.
 */
protected function extractSearchTerms(string $message): array
{
    $topic = '';
    if (preg_match('/\b(my|the|their|our|all|this|these)\s+(\w+)/i', $message, $matches) === 1) {
        $topic = $matches[2];
    }

    if (strlen($topic) < 3) {
        $stopWords = ['where', 'can', 'find', 'get', 'look', 'show', 'list', 'tell', 'know', 'want', 'need', 'help', 'give', 'me', 'i', 'is', 'are', 'was', 'were', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'what', 'how', 'why'];

        // Split on anything that is not a letter or digit so that attached
        // punctuation ("invoices?") does not end up inside the topic.
        $lowered = strtolower($message);
        $words = preg_split('/[^\p{L}\p{N}]+/u', $lowered, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // The /u pattern fails on invalid UTF-8; fall back to whitespace splitting.
            $words = preg_split('/\s+/', $lowered, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        }

        foreach ($words as $word) {
            if (strlen($word) >= 4 && !in_array($word, $stopWords, true)) {
                $topic = $word;
                break;
            }
        }
    }

    if (strlen($topic) < 3) {
        return [];
    }

    $prompt = "List 5 synonyms for: {$topic}. Return ONLY JSON array like: [\"word1\",\"word2\"]";

    $response = $this->callOllama([
        ['role' => 'system', 'content' => 'Return only valid JSON.'],
        ['role' => 'user', 'content' => $prompt],
    ]);

    $json = $this->extractJson($response['content'] ?? '');

    // The model's output is untrusted: keep only non-empty strings so that nested
    // arrays, numbers or nulls can never reach the search code.
    $terms = [];
    if (is_array($json)) {
        foreach ($json as $candidate) {
            if (is_string($candidate) && trim($candidate) !== '') {
                $terms[] = strtolower(trim($candidate));
            }
        }
    }

    // Always search for the topic itself, even when no synonyms came back.
    $terms[] = strtolower($topic);

    return array_values(array_unique($terms));
}
299
+
220
300
  protected function findBestRouteWithTerms(array $routes, array $terms): ?string
221
301
  {
222
302
  $scored = [];