lwazi 1.7.3 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -6,6 +6,7 @@ use DOMDocument;
|
|
|
6
6
|
use DOMXPath;
|
|
7
7
|
use Illuminate\Support\Facades\Http;
|
|
8
8
|
use Illuminate\Support\Str;
|
|
9
|
+
use Lwazi\Core\Services\ContentIndexer;
|
|
9
10
|
|
|
10
11
|
class NavigationCrawler
|
|
11
12
|
{
|
|
@@ -20,13 +21,15 @@ class NavigationCrawler
|
|
|
20
21
|
protected bool $debug;
|
|
21
22
|
protected ?string $stateFile = null;
|
|
22
23
|
protected int $retryAttempts = 3;
|
|
23
|
-
protected int $retryDelay = 1000;
|
|
24
|
+
protected int $retryDelay = 1000;
|
|
25
|
+
protected ContentIndexer $contentIndexer;
|
|
24
26
|
|
|
25
27
|
/**
 * Create a crawler rooted at the given URL.
 *
 * @param string              $rootUrl        Root URL of the crawl; trailing slashes are removed.
 * @param bool                $debug          Stored on the instance as the debug flag.
 * @param ContentIndexer|null $contentIndexer Indexer that receives every crawled page. Optional so
 *                                            existing two-argument callers keep working; a fresh
 *                                            ContentIndexer is created when omitted. Passing one in
 *                                            lets callers share or substitute the indexer.
 */
public function __construct(string $rootUrl, bool $debug = false, ?ContentIndexer $contentIndexer = null)
{
    $this->rootUrl = rtrim($rootUrl, '/');
    $this->debug = $debug;
    // Crawl state lives under the application's storage directory.
    $this->stateFile = storage_path('lwazi/crawl_state.json');
    $this->contentIndexer = $contentIndexer ?? new ContentIndexer();
}
|
|
31
34
|
|
|
32
35
|
public function setMaxPages(int $max): self
|
|
@@ -83,6 +86,8 @@ class NavigationCrawler
|
|
|
83
86
|
|
|
84
87
|
$this->visited[$url] = true;
|
|
85
88
|
|
|
89
|
+
$this->contentIndexer->indexPage($url, $html);
|
|
90
|
+
|
|
86
91
|
$meta = $this->extractPageMeta($html, $url);
|
|
87
92
|
$this->nodes[$url] = $meta;
|
|
88
93
|
$this->flatIndex[$url] = [
|
|
@@ -401,6 +406,10 @@ class NavigationCrawler
|
|
|
401
406
|
|
|
402
407
|
file_put_contents($path, json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
|
|
403
408
|
$this->log("Manifest saved to $path");
|
|
409
|
+
|
|
410
|
+
$contentPath = storage_path('lwazi/content_index.json');
|
|
411
|
+
file_put_contents($contentPath, json_encode($this->contentIndexer->toArray(), JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
|
|
412
|
+
$this->log("Content index saved to $contentPath");
|
|
404
413
|
}
|
|
405
414
|
|
|
406
415
|
protected function log(string $msg): void
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
<?php
|
|
2
|
+
|
|
3
|
+
namespace Lwazi\Core\Services;
|
|
4
|
+
|
|
5
|
+
/**
 * In-memory TF-IDF index over the text of crawled HTML pages.
 *
 * Pages are added with indexPage(), queried with search(), and the whole
 * index can be exported / restored through toArray() / fromArray().
 *
 * Tokenisation is ASCII-only: text is lower-cased and every character outside
 * [a-z0-9] is treated as a separator, so non-ASCII letters split words.
 */
class ContentIndexer
{
    /** Not read or written by this class; kept so subclasses that reference it keep working. */
    protected array $index = [];

    /** @var array<string, array> Extracted content keyed by URL (shape produced by extractContent()). */
    protected array $documents = [];

    /** @var array<string, array<string, int>> URL => token => number of occurrences in that page. */
    protected array $termFrequency = [];

    /** @var array<string, int> Token => number of indexed pages containing it. */
    protected array $documentFrequency = [];

    /** Number of pages currently in the index. */
    protected int $totalDocuments = 0;

    /**
     * Extract the text of a page and add it to the index.
     *
     * Pages that yield no text are ignored. Indexing a URL that is already
     * present replaces the previous entry instead of counting it twice.
     *
     * @param string $url  Key under which the page is stored.
     * @param string $html Raw HTML of the page.
     */
    public function indexPage(string $url, string $html): void
    {
        $content = $this->extractContent($url, $html);

        if (trim((string) ($content['text'] ?? '')) === '') {
            return;
        }

        // Drop the counts of an earlier version of this URL so frequencies stay consistent.
        $this->forgetDocument($url);

        $this->documents[$url] = $content;
        $this->totalDocuments++;

        $counts = array_count_values($this->tokenize($content['text']));
        $this->termFrequency[$url] = $counts;

        foreach (array_keys($counts) as $token) {
            $this->documentFrequency[$token] = ($this->documentFrequency[$token] ?? 0) + 1;
        }
    }

    /**
     * Remove a previously indexed URL together with its contribution to the
     * document-frequency table. Does nothing for unknown URLs.
     */
    private function forgetDocument(string $url): void
    {
        if (!isset($this->documents[$url])) {
            return;
        }

        foreach (array_keys($this->termFrequency[$url] ?? []) as $token) {
            if (!isset($this->documentFrequency[$token])) {
                continue;
            }
            if (--$this->documentFrequency[$token] <= 0) {
                unset($this->documentFrequency[$token]);
            }
        }

        unset($this->documents[$url], $this->termFrequency[$url]);
        $this->totalDocuments = max(0, $this->totalDocuments - 1);
    }

    /**
     * Pull title, headings, paragraphs and list items out of an HTML page.
     *
     * @return array{url: string, title: string, headings: array, paragraphs: array, lists: array, text: string}
     *         'paragraphs' is capped at 10 entries and 'lists' at 20; 'text' is the
     *         whitespace-collapsed, trimmed concatenation of everything extracted
     *         (before capping) and is '' when nothing was found.
     */
    protected function extractContent(string $url, string $html): array
    {
        $content = [
            'url' => $url,
            'title' => '',
            'headings' => [],
            'paragraphs' => [],
            'lists' => [],
            'text' => '',
        ];

        // DOMDocument::loadHTML() throws a ValueError on an empty string in PHP 8.
        if (trim($html) === '') {
            return $content;
        }

        // Collect parser errors internally, then restore the caller's libxml setting.
        $previous = libxml_use_internal_errors(true);
        try {
            $dom = new \DOMDocument();
            $loaded = $dom->loadHTML($html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($loaded === false) {
            return $content;
        }

        $xpath = new \DOMXPath($dom);

        $titles = $this->collectText($xpath, '//title');
        $title = $titles[0] ?? '';

        // Headings are grouped by level (all h1, then all h2, ...), not document order.
        $headings = [];
        foreach ([1, 2, 3, 4, 5, 6] as $level) {
            foreach ($this->collectText($xpath, "//h{$level}") as $heading) {
                $headings[] = $heading;
            }
        }

        // Very short paragraphs / list items are skipped (length measured in bytes).
        $paragraphs = $this->collectText($xpath, '//p', 20);
        $lists = $this->collectText($xpath, '//ul//li | //ol//li', 5);

        $text = implode(' ', array_merge([$title], $headings, $paragraphs, $lists));
        $text = trim(preg_replace('/\s+/', ' ', $text) ?? $text);

        $content['title'] = $title;
        $content['headings'] = $headings;
        $content['paragraphs'] = array_slice($paragraphs, 0, 10);
        $content['lists'] = array_slice($lists, 0, 20);
        $content['text'] = $text;

        return $content;
    }

    /**
     * Return the trimmed text of every node matching an XPath expression,
     * keeping only non-empty values longer than $minBytes bytes.
     *
     * @return string[]
     */
    private function collectText(\DOMXPath $xpath, string $expression, int $minBytes = 0): array
    {
        $texts = [];
        $nodes = $xpath->query($expression);

        if ($nodes === false) {
            return $texts;
        }

        foreach ($nodes as $node) {
            $text = trim($node->textContent ?? '');
            if ($text !== '' && strlen($text) > $minBytes) {
                $texts[] = $text;
            }
        }

        return $texts;
    }

    /**
     * Split text into lower-case ASCII tokens of at least three characters,
     * dropping stop words. Order and repetitions are preserved.
     *
     * @return string[]
     */
    protected function tokenize(string $text): array
    {
        $normalized = preg_replace('/[^a-z0-9\s]/', ' ', strtolower($text)) ?? '';
        $pieces = preg_split('/\s+/', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        // Flip once so each token is a constant-time lookup rather than a list scan.
        $stopWords = array_flip($this->getStopWords());

        $tokens = [];
        foreach ($pieces as $piece) {
            if (strlen($piece) >= 3 && !isset($stopWords[$piece])) {
                $tokens[] = $piece;
            }
        }

        return $tokens;
    }

    /**
     * Words excluded from the index.
     *
     * @return string[]
     */
    protected function getStopWords(): array
    {
        return [
            'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
            'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
            'how', 'its', 'may', 'now', 'old', 'see', 'than', 'that', 'this', 'too',
            'was', 'will', 'with', 'have', 'from', 'they', 'been', 'were', 'said',
            'each', 'which', 'their', 'what', 'when', 'where', 'who', 'will',
            'about', 'after', 'would', 'there', 'could', 'other', 'into', 'more',
            'some', 'them', 'then', 'these', 'so', 'just', 'because', 'being',
            'also', 'before', 'here', 'how', 'made', 'make', 'only', 'over',
            'such', 'take', 'through', 'under', 'very', 'what', 'your', 'like',
        ];
    }

    /**
     * Rank indexed pages against a list of search terms.
     *
     * Terms are normalised with the same tokenizer as page text (so case,
     * punctuation and multi-word terms are handled), de-duplicated, and
     * non-scalar entries are ignored. Each page scores the sum of
     * tf * log(1 + N / df) over the matching terms; the smoothed IDF keeps the
     * score positive even when a term occurs in every page.
     *
     * @param array $terms Search terms.
     * @param int   $limit Maximum number of results; values <= 0 yield no results.
     *
     * @return array<int, array{url: mixed, title: string, matched_terms: string[], score: float, snippet: string}>
     *         Best match first.
     */
    public function search(array $terms, int $limit = 5): array
    {
        if ($limit <= 0 || $this->totalDocuments <= 0) {
            return [];
        }

        $queryTerms = $this->normalizeTerms($terms);
        if ($queryTerms === []) {
            return [];
        }

        $scores = [];

        foreach ($this->documents as $url => $doc) {
            $frequencies = $this->termFrequency[$url] ?? null;
            if (!is_array($doc) || !is_array($frequencies)) {
                continue;
            }

            $score = 0.0;
            $matchedTerms = [];

            foreach ($queryTerms as $term) {
                if (!isset($frequencies[$term])) {
                    continue;
                }

                // Guard against a zero/invalid df coming from a loaded index.
                $df = max(1, (int) ($this->documentFrequency[$term] ?? 1));
                $score += (int) $frequencies[$term] * log(1 + $this->totalDocuments / $df);
                $matchedTerms[] = $term;
            }

            if ($score > 0) {
                $scores[$url] = [
                    'score' => $score,
                    'matched_terms' => $matchedTerms,
                    'doc' => $doc,
                ];
            }
        }

        uasort($scores, static fn ($a, $b) => $b['score'] <=> $a['score']);

        $results = [];
        foreach (array_slice($scores, 0, $limit, true) as $url => $data) {
            $title = $data['doc']['title'] ?? '';
            if (!is_string($title) || trim($title) === '') {
                // Untitled pages fall back to the last path segment of the URL.
                $title = basename((string) $url);
            }

            $results[] = [
                'url' => $url,
                'title' => $title,
                'matched_terms' => $data['matched_terms'],
                'score' => $data['score'],
                'snippet' => $this->generateSnippet($data['doc'], $data['matched_terms']),
            ];
        }

        return $results;
    }

    /**
     * Turn raw search terms into a de-duplicated list of index tokens.
     *
     * @return string[]
     */
    private function normalizeTerms(array $terms): array
    {
        $seen = [];

        foreach ($terms as $term) {
            if (!is_scalar($term)) {
                continue;
            }
            foreach ($this->tokenize((string) $term) as $token) {
                $seen[$token] = true;
            }
        }

        // Numeric tokens become integer array keys; hand back strings throughout.
        return array_map('strval', array_keys($seen));
    }

    /**
     * Build a short excerpt around the earliest occurrence of any term.
     *
     * Uses the document's paragraphs, or its list items when there are no
     * paragraphs. The excerpt keeps the original letter case, spans at most
     * 200 bytes starting up to 50 bytes before the match, never splits a UTF-8
     * character, and carries '...' only on the sides where text was cut off.
     *
     * @param array $doc   Document as produced by extractContent().
     * @param array $terms Terms to look for (matched case-insensitively).
     *
     * @return string The excerpt, or '' when the document has no usable text.
     */
    protected function generateSnippet(array $doc, array $terms): string
    {
        $text = $this->joinStrings($doc['paragraphs'] ?? []);
        if ($text === '') {
            $text = $this->joinStrings($doc['lists'] ?? []);
        }
        if ($text === '') {
            return '';
        }

        $earliest = null;
        foreach ($terms as $term) {
            if (!is_string($term) || $term === '') {
                continue;
            }
            $position = stripos($text, $term);
            if ($position !== false && ($earliest === null || $position < $earliest)) {
                $earliest = $position;
            }
        }

        $length = strlen($text);
        $start = $this->alignToCharBoundary($text, max(0, ($earliest ?? 0) - 50));
        $end = $this->alignToCharBoundary($text, min($length, $start + 200));

        $snippet = trim(substr($text, $start, $end - $start));
        if ($snippet === '') {
            return '';
        }

        return ($start > 0 ? '...' : '') . $snippet . ($end < $length ? '...' : '');
    }

    /**
     * Join the string entries of a list with spaces and collapse whitespace.
     *
     * @param mixed $values Expected to be a list of strings; anything else yields ''.
     */
    private function joinStrings($values): string
    {
        if (!is_array($values)) {
            return '';
        }

        $joined = implode(' ', array_filter($values, 'is_string'));

        return trim(preg_replace('/\s+/', ' ', $joined) ?? $joined);
    }

    /**
     * Move a byte offset back (by at most three bytes) so it does not point at
     * a UTF-8 continuation byte.
     */
    private function alignToCharBoundary(string $text, int $offset): int
    {
        $length = strlen($text);
        if ($offset <= 0) {
            return 0;
        }
        if ($offset >= $length) {
            return $length;
        }

        for ($step = 0; $step < 3 && $offset > 0 && (ord($text[$offset]) & 0xC0) === 0x80; $step++) {
            $offset--;
        }

        return $offset;
    }

    /**
     * Export the index as a plain array (the counterpart of fromArray()).
     *
     * @return array{documents: array, term_frequency: array, document_frequency: array, total_documents: int}
     */
    public function toArray(): array
    {
        return [
            'documents' => $this->documents,
            'term_frequency' => $this->termFrequency,
            'document_frequency' => $this->documentFrequency,
            'total_documents' => $this->totalDocuments,
        ];
    }

    /**
     * Rebuild an indexer from the array produced by toArray().
     *
     * Missing or wrongly typed sections fall back to empty arrays, and a
     * missing or non-numeric 'total_documents' falls back to the number of
     * documents, so a damaged export cannot raise type errors later.
     */
    public static function fromArray(array $data): self
    {
        $section = static function (string $key) use ($data): array {
            return is_array($data[$key] ?? null) ? $data[$key] : [];
        };

        $indexer = new self();
        $indexer->documents = $section('documents');
        $indexer->termFrequency = $section('term_frequency');
        $indexer->documentFrequency = $section('document_frequency');

        $total = $data['total_documents'] ?? null;
        $indexer->totalDocuments = is_numeric($total)
            ? max(0, (int) $total)
            : count($indexer->documents);

        return $indexer;
    }
}
|
|
@@ -76,6 +76,12 @@ class LwaziService
|
|
|
76
76
|
}
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
+
$contentResponse = $this->searchContent($message);
|
|
80
|
+
if ($contentResponse) {
|
|
81
|
+
$this->conversationHistory[] = ['role' => 'assistant', 'content' => $contentResponse];
|
|
82
|
+
return $contentResponse;
|
|
83
|
+
}
|
|
84
|
+
|
|
79
85
|
$dataResponse = $this->fetchRelevantData($message);
|
|
80
86
|
if ($dataResponse) {
|
|
81
87
|
$this->conversationHistory[] = ['role' => 'assistant', 'content' => $dataResponse];
|
|
@@ -217,6 +223,80 @@ class LwaziService
|
|
|
217
223
|
return null;
|
|
218
224
|
}
|
|
219
225
|
|
|
226
|
+
/**
 * Answer a message from the crawled content index, if one exists.
 *
 * Reads storage_path('lwazi/content_index.json'), derives search terms from
 * the message via extractSearchTerms(), and formats up to three hits as a
 * Markdown-style list (bold title, snippet, "Read more" link).
 *
 * @return string|null The formatted answer, or null when the index file is
 *                     missing, holds no documents, no terms could be derived,
 *                     or nothing matched.
 */
protected function searchContent(string $message): ?string
{
    $indexFile = storage_path('lwazi/content_index.json');
    if (!file_exists($indexFile)) {
        return null;
    }

    $payload = json_decode(file_get_contents($indexFile), true);
    $hasDocuments = $payload && !empty($payload['documents']);
    if (!$hasDocuments) {
        return null;
    }

    $indexer = ContentIndexer::fromArray($payload);

    // Without terms there is nothing to look up; an empty hit list ends the search.
    $terms = $this->extractSearchTerms($message);
    $results = empty($terms) ? [] : $indexer->search($terms, 3);
    if (empty($results)) {
        return null;
    }

    $entries = array_map(
        static function ($result) {
            return "**{$result['title']}**\n"
                . "{$result['snippet']}\n"
                . "[Read more]({$result['url']})\n\n";
        },
        $results
    );

    return "I found some relevant information:\n\n" . implode('', $entries);
}
|
|
261
|
+
|
|
262
|
+
/**
 * Derive search terms for the content index from a chat message.
 *
 * A topic word is taken from the word following a determiner
 * (my/the/their/our/all/this/these); failing that, from the first word of
 * four or more characters that is not a question/filler word. The topic is
 * then expanded with synonyms requested from the model via callOllama().
 *
 * Only non-empty string values from the model's JSON are kept (nested
 * structures are flattened), surrounding punctuation is stripped from the
 * fallback topic, and terms are de-duplicated case-insensitively, so the
 * result is always a flat list of strings with the lower-cased topic last.
 *
 * NOTE(review): failures of callOllama() are not handled here; whether it can
 * throw is not visible in this file section.
 *
 * @return string[] Search terms, or [] when no topic could be identified.
 */
protected function extractSearchTerms(string $message): array
{
    $topic = '';
    if (preg_match('/\b(my|the|their|our|all|this|these)\s+(\w+)/i', $message, $matches) === 1) {
        $topic = $matches[2];
    }

    if (strlen($topic) < 3) {
        $topic = '';
        $stopWords = ['where', 'can', 'find', 'get', 'look', 'show', 'list', 'tell', 'know', 'want', 'need', 'help', 'give', 'me', 'i', 'is', 'are', 'was', 'were', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'what', 'how', 'why'];

        foreach (preg_split('/\s+/', strtolower($message)) ?: [] as $word) {
            // "invoices?" should yield "invoices", which is what the index contains.
            $word = trim($word, ".,;:!?\"'()[]{}<>");
            if (strlen($word) >= 4 && !in_array($word, $stopWords, true)) {
                $topic = $word;
                break;
            }
        }
    }

    if ($topic === '') {
        return [];
    }

    $prompt = "List 5 synonyms for: {$topic}. Return ONLY JSON array like: [\"word1\",\"word2\"]";

    $response = $this->callOllama([
        ['role' => 'system', 'content' => 'Return only valid JSON.'],
        ['role' => 'user', 'content' => $prompt],
    ]);

    $json = $this->extractJson($response['content'] ?? '');

    // The model may return objects or nested arrays; keep only the string leaves.
    $terms = [];
    if (is_array($json)) {
        array_walk_recursive($json, static function ($value) use (&$terms): void {
            if (is_string($value) && trim($value) !== '') {
                $terms[] = trim($value);
            }
        });
    }

    $terms[] = strtolower($topic);

    // Case-insensitive de-duplication; the first spelling of each term wins.
    $unique = [];
    foreach ($terms as $term) {
        $key = strtolower($term);
        if (!isset($unique[$key])) {
            $unique[$key] = $term;
        }
    }

    return array_values($unique);
}
|
|
299
|
+
|
|
220
300
|
protected function findBestRouteWithTerms(array $routes, array $terms): ?string
|
|
221
301
|
{
|
|
222
302
|
$scored = [];
|