npm - lwazi - Versions diffs - 1.7.3 → 1.8.0 - Mend

lwazi 1.7.3 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json +1 -1
package/src/Installer/NavigationCrawler.php +10 -1
package/src/Services/ContentIndexer.php +245 -0
package/src/Services/LwaziService.php +80 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "lwazi",
-  "version": "1.7.3",
+  "version": "1.8.0",
   "description": "Lwazi is an AI assistant for Laravel. Install with one command to add an AI assistant to your Laravel app.",
   "main": "bin/lwazi.js",
   "bin": {

package/src/Installer/NavigationCrawler.php CHANGED Viewed

@@ -6,6 +6,7 @@ use DOMDocument;
 use DOMXPath;
 use Illuminate\Support\Facades\Http;
 use Illuminate\Support\Str;
+use Lwazi\Core\Services\ContentIndexer;
 class NavigationCrawler
 {
@@ -20,13 +21,15 @@ class NavigationCrawler
     protected bool $debug;
     protected ?string $stateFile = null;
     protected int $retryAttempts = 3;
-    protected int $retryDelay = 1000; // milliseconds
+    protected int $retryDelay = 1000;
+    protected ContentIndexer $contentIndexer;
     public function __construct(string $rootUrl, bool $debug = false)
     {
         $this->rootUrl = rtrim($rootUrl, '/');
         $this->debug = $debug;
         $this->stateFile = storage_path('lwazi/crawl_state.json');
+        $this->contentIndexer = new ContentIndexer();
     }
     public function setMaxPages(int $max): self
@@ -83,6 +86,8 @@ class NavigationCrawler
             $this->visited[$url] = true;
+            $this->contentIndexer->indexPage($url, $html);
             $meta = $this->extractPageMeta($html, $url);
             $this->nodes[$url] = $meta;
             $this->flatIndex[$url] = [
@@ -401,6 +406,10 @@ class NavigationCrawler
         file_put_contents($path, json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
         $this->log("Manifest saved to $path");
+        $contentPath = storage_path('lwazi/content_index.json');
+        file_put_contents($contentPath, json_encode($this->contentIndexer->toArray(), JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
+        $this->log("Content index saved to $contentPath");
     }
     protected function log(string $msg): void

package/src/Services/ContentIndexer.php ADDED Viewed

@@ -0,0 +1,245 @@
+<?php
+namespace Lwazi\Core\Services;
+/**
+ * In-memory TF-IDF index over the textual content of crawled HTML pages.
+ *
+ * Pages are added with indexPage(), queried with search(), and the whole
+ * index can be round-tripped through toArray()/fromArray() for persistence.
+ */
+class ContentIndexer
+{
+    /** @var array Not used by this class; kept so existing subclasses keep working. */
+    protected array $index = [];
+    /** @var array<string, array> Extracted page content keyed by URL. */
+    protected array $documents = [];
+    /** @var array<string, array<string, int>> Token occurrence counts per URL. */
+    protected array $termFrequency = [];
+    /** @var array<string, int> Number of indexed documents containing each token. */
+    protected array $documentFrequency = [];
+    /** @var int Number of documents currently in the index. */
+    protected int $totalDocuments = 0;
+    /**
+     * Extract the text of a page and add it to the index.
+     *
+     * Pages without any indexable text are ignored. Indexing a URL that is
+     * already present replaces its previous statistics instead of adding to
+     * them, so document counts stay correct when a page is crawled twice.
+     *
+     * @param string $url  Key under which the page is stored.
+     * @param string $html Raw HTML of the page.
+     */
+    public function indexPage(string $url, string $html): void
+    {
+        $content = $this->extractContent($url, $html);
+        if (trim((string) ($content['text'] ?? '')) === '') {
+            return;
+        }
+        $this->removeDocument($url);
+        $this->documents[$url] = $content;
+        $this->totalDocuments++;
+        $counts = array_count_values($this->tokenize($content['text']));
+        $this->termFrequency[$url] = $counts;
+        foreach (array_keys($counts) as $token) {
+            $this->documentFrequency[$token] = ($this->documentFrequency[$token] ?? 0) + 1;
+        }
+    }
+    /**
+     * Remove a previously indexed URL and roll back its contribution to the
+     * document-frequency table and the document count.
+     */
+    private function removeDocument(string $url): void
+    {
+        if (!isset($this->documents[$url])) {
+            return;
+        }
+        foreach (array_keys($this->termFrequency[$url] ?? []) as $token) {
+            if (!isset($this->documentFrequency[$token])) {
+                continue;
+            }
+            if (--$this->documentFrequency[$token] <= 0) {
+                unset($this->documentFrequency[$token]);
+            }
+        }
+        unset($this->documents[$url], $this->termFrequency[$url]);
+        $this->totalDocuments = max(0, $this->totalDocuments - 1);
+    }
+    /**
+     * Pull title, headings, paragraphs and list items out of an HTML page.
+     *
+     * @return array{url: string, title: string, headings: array, paragraphs: array, lists: array, text: string}
+     *         'text' is the whitespace-collapsed, trimmed concatenation of all
+     *         extracted parts and is '' when nothing could be extracted.
+     */
+    protected function extractContent(string $url, string $html): array
+    {
+        $content = [
+            'url' => $url,
+            'title' => '',
+            'headings' => [],
+            'paragraphs' => [],
+            'lists' => [],
+            'text' => '',
+        ];
+        // DOMDocument::loadHTML() throws ValueError for an empty string on PHP 8,
+        // so blank input is answered before the parser is involved.
+        if (trim($html) === '') {
+            return $content;
+        }
+        // Silence libxml warnings for malformed markup, then restore the
+        // caller's setting so this method has no process-wide side effect.
+        $previous = libxml_use_internal_errors(true);
+        try {
+            $dom = new \DOMDocument();
+            $loaded = $dom->loadHTML($html);
+        } finally {
+            libxml_clear_errors();
+            libxml_use_internal_errors($previous);
+        }
+        if ($loaded === false) {
+            return $content;
+        }
+        $xpath = new \DOMXPath($dom);
+        $titleNodes = $xpath->query('//title');
+        $title = '';
+        if ($titleNodes !== false && $titleNodes->length > 0) {
+            $title = trim($titleNodes->item(0)->textContent ?? '');
+        }
+        // Headings are grouped by level (all h1, then all h2, ...).
+        $headings = [];
+        foreach ([1, 2, 3, 4, 5, 6] as $level) {
+            $headings = array_merge($headings, $this->collectText($xpath, "//h{$level}", 1));
+        }
+        // Very short paragraphs and list items are skipped.
+        $paragraphs = $this->collectText($xpath, '//p', 21);
+        $lists = $this->collectText($xpath, '//ul//li | //ol//li', 6);
+        $text = implode(' ', array_merge([$title], $headings, $paragraphs, $lists));
+        $text = trim(preg_replace('/\s+/', ' ', $text) ?? $text);
+        $content['title'] = $title;
+        $content['headings'] = $headings;
+        // Only a bounded sample is stored for snippets; 'text' covers everything.
+        $content['paragraphs'] = array_slice($paragraphs, 0, 10);
+        $content['lists'] = array_slice($lists, 0, 20);
+        $content['text'] = $text;
+        return $content;
+    }
+    /**
+     * Return the trimmed text of every node matching an XPath expression
+     * whose byte length is at least $minBytes.
+     */
+    private function collectText(\DOMXPath $xpath, string $expression, int $minBytes): array
+    {
+        $nodes = $xpath->query($expression);
+        if ($nodes === false) {
+            return [];
+        }
+        $texts = [];
+        foreach ($nodes as $node) {
+            $text = trim($node->textContent ?? '');
+            if (strlen($text) >= $minBytes) {
+                $texts[] = $text;
+            }
+        }
+        return $texts;
+    }
+    /**
+     * Split text into lowercase [a-z0-9] tokens of at least three characters,
+     * dropping stop words.
+     *
+     * @return array List of tokens in order of appearance (duplicates kept).
+     */
+    protected function tokenize(string $text): array
+    {
+        $normalized = preg_replace('/[^a-z0-9\s]/', ' ', strtolower($text)) ?? '';
+        $words = preg_split('/\s+/', $normalized, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        // Flipping once gives O(1) membership checks instead of a linear scan per token.
+        $stopWords = array_flip($this->getStopWords());
+        return array_values(array_filter($words, static function ($word) use ($stopWords) {
+            return strlen($word) >= 3 && !isset($stopWords[$word]);
+        }));
+    }
+    /**
+     * Words that carry no search value and are never indexed.
+     *
+     * @return array List of lowercase stop words.
+     */
+    protected function getStopWords(): array
+    {
+        return [
+            'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
+            'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
+            'how', 'its', 'may', 'now', 'old', 'see', 'than', 'that', 'this', 'too',
+            'will', 'with', 'have', 'from', 'they', 'been', 'were', 'said',
+            'each', 'which', 'their', 'what', 'when', 'where', 'who',
+            'about', 'after', 'would', 'there', 'could', 'other', 'into', 'more',
+            'some', 'them', 'then', 'these', 'just', 'because', 'being',
+            'also', 'before', 'here', 'made', 'make', 'only', 'over',
+            'such', 'take', 'through', 'under', 'very', 'your', 'like',
+        ];
+    }
+    /**
+     * Rank indexed documents against a list of search terms.
+     *
+     * Each term contributes tf * idf to a document's score, with the smoothed
+     * idf log(1 + N / df). The smoothing keeps the weight positive when a term
+     * occurs in every document, so an index holding a single page is still
+     * searchable.
+     *
+     * @param array $terms Search terms; non-scalar entries, terms shorter than
+     *                     three characters and duplicates are ignored.
+     * @param int   $limit Maximum number of results; values below 1 yield [].
+     * @return array List of ['url', 'title', 'matched_terms', 'score', 'snippet'],
+     *               best match first.
+     */
+    public function search(array $terms, int $limit = 5): array
+    {
+        if ($limit < 1 || $this->totalDocuments <= 0) {
+            return [];
+        }
+        // Normalise the query once instead of once per document.
+        $queryTerms = [];
+        foreach ($terms as $term) {
+            if (!is_scalar($term)) {
+                continue;
+            }
+            $term = strtolower(trim((string) $term));
+            if (strlen($term) >= 3) {
+                $queryTerms[$term] = true;
+            }
+        }
+        if ($queryTerms === []) {
+            return [];
+        }
+        $weights = [];
+        foreach (array_keys($queryTerms) as $term) {
+            // A missing or corrupt document frequency is treated as 1.
+            $df = max(1, (int) ($this->documentFrequency[$term] ?? 1));
+            $weights[$term] = log(1 + $this->totalDocuments / $df);
+        }
+        $scores = [];
+        foreach ($this->documents as $url => $doc) {
+            $frequencies = $this->termFrequency[$url] ?? null;
+            if (!is_array($frequencies)) {
+                continue;
+            }
+            $score = 0.0;
+            $matchedTerms = [];
+            foreach ($weights as $term => $weight) {
+                if (isset($frequencies[$term])) {
+                    $score += $frequencies[$term] * $weight;
+                    $matchedTerms[] = (string) $term;
+                }
+            }
+            if ($score > 0) {
+                $scores[$url] = [
+                    'score' => $score,
+                    'matched_terms' => $matchedTerms,
+                    'doc' => $doc,
+                ];
+            }
+        }
+        uasort($scores, fn($a, $b) => $b['score'] <=> $a['score']);
+        $results = [];
+        foreach (array_slice($scores, 0, $limit, true) as $url => $data) {
+            $url = (string) $url;
+            // The stored title is always a string, so test for emptiness rather than null.
+            $title = trim((string) ($data['doc']['title'] ?? ''));
+            if ($title === '') {
+                $title = basename($url) ?: $url;
+            }
+            $results[] = [
+                'url' => $url,
+                'title' => $title,
+                'matched_terms' => $data['matched_terms'],
+                'score' => $data['score'],
+                'snippet' => $this->generateSnippet($data['doc'], $data['matched_terms']),
+            ];
+        }
+        return $results;
+    }
+    /**
+     * Build a short excerpt (up to 200 characters) around the earliest
+     * occurrence of any matched term.
+     *
+     * The excerpt keeps the original letter case and is cut on character
+     * boundaries; '...' marks each side where text was left out.
+     *
+     * @param array $doc   Document as produced by extractContent().
+     * @param array $terms Matched terms to centre the excerpt on.
+     */
+    protected function generateSnippet(array $doc, array $terms): string
+    {
+        $text = implode(' ', $doc['paragraphs'] ?? []);
+        if ($text === '') {
+            $text = implode(' ', $doc['lists'] ?? []);
+        }
+        $text = trim(preg_replace('/\s+/', ' ', $text) ?? $text);
+        if ($text === '') {
+            return '';
+        }
+        $earliest = null;
+        foreach ($terms as $term) {
+            $term = (string) $term;
+            if ($term === '') {
+                continue;
+            }
+            // Case-insensitive search on the original text, so the excerpt is not lowercased.
+            $pos = mb_stripos($text, $term, 0, 'UTF-8');
+            if ($pos !== false && ($earliest === null || $pos < $earliest)) {
+                $earliest = $pos;
+            }
+        }
+        $start = max(0, ($earliest ?? 0) - 50);
+        $snippet = trim(mb_substr($text, $start, 200, 'UTF-8'));
+        $prefix = $start > 0 ? '...' : '';
+        $suffix = $start + 200 < mb_strlen($text, 'UTF-8') ? '...' : '';
+        return $prefix . $snippet . $suffix;
+    }
+    /**
+     * Export the index as a plain array suitable for json_encode().
+     */
+    public function toArray(): array
+    {
+        return [
+            'documents' => $this->documents,
+            'term_frequency' => $this->termFrequency,
+            'document_frequency' => $this->documentFrequency,
+            'total_documents' => $this->totalDocuments,
+        ];
+    }
+    /**
+     * Rebuild an index from the structure produced by toArray().
+     *
+     * Missing or malformed sections fall back to empty values; a missing or
+     * non-numeric document count falls back to the number of stored documents.
+     */
+    public static function fromArray(array $data): self
+    {
+        $section = static function (string $key) use ($data): array {
+            return is_array($data[$key] ?? null) ? $data[$key] : [];
+        };
+        $indexer = new self();
+        $indexer->documents = $section('documents');
+        $indexer->termFrequency = $section('term_frequency');
+        $indexer->documentFrequency = $section('document_frequency');
+        $total = $data['total_documents'] ?? null;
+        $indexer->totalDocuments = is_numeric($total)
+            ? max(0, (int) $total)
+            : count($indexer->documents);
+        return $indexer;
+    }
+}

package/src/Services/LwaziService.php CHANGED Viewed

@@ -76,6 +76,12 @@ class LwaziService
             }
         }
+        $contentResponse = $this->searchContent($message);
+        if ($contentResponse) {
+            $this->conversationHistory[] = ['role' => 'assistant', 'content' => $contentResponse];
+            return $contentResponse;
+        }
         $dataResponse = $this->fetchRelevantData($message);
         if ($dataResponse) {
             $this->conversationHistory[] = ['role' => 'assistant', 'content' => $dataResponse];
@@ -217,6 +223,80 @@ class LwaziService
         return null;
     }
+    /**
+     * Try to answer a message from the persisted crawl content index.
+     *
+     * Loads storage 'lwazi/content_index.json', derives search terms from the
+     * message, and formats up to three matches as a markdown list.
+     *
+     * @param string $message The user's message.
+     * @return string|null Markdown response, or null when the index file is
+     *                     missing or empty, no terms could be derived, or
+     *                     nothing matched.
+     */
+    protected function searchContent(string $message): ?string
+    {
+        $indexFile = storage_path('lwazi/content_index.json');
+        if (!file_exists($indexFile)) {
+            return null;
+        }
+        $payload = json_decode(file_get_contents($indexFile), true);
+        if (!$payload || empty($payload['documents'])) {
+            return null;
+        }
+        $contentIndex = ContentIndexer::fromArray($payload);
+        $searchTerms = $this->extractSearchTerms($message);
+        if (empty($searchTerms)) {
+            return null;
+        }
+        $matches = $contentIndex->search($searchTerms, 3);
+        if (empty($matches)) {
+            return null;
+        }
+        // One markdown entry per match: bold title, snippet, link.
+        $entries = array_map(function ($match) {
+            return "**{$match['title']}**\n"
+                . "{$match['snippet']}\n"
+                . "[Read more]({$match['url']})\n\n";
+        }, $matches);
+        return "I found some relevant information:\n\n" . implode('', $entries);
+    }
+    /**
+     * Derive content-search terms from a user message.
+     *
+     * A topic word is taken from the word following a determiner ("my", "the",
+     * "our", ...) or, failing that, from the first non-stop-word of at least
+     * four characters. The model is then asked for synonyms of that topic.
+     *
+     * Every returned term is normalised to lowercase [a-z0-9] words of at
+     * least three characters, which is the token form ContentIndexer matches
+     * against. Non-string model output is discarded and duplicates are removed.
+     *
+     * @param string $message The user's message.
+     * @return array List of unique search terms (synonyms first, then the
+     *               topic), or [] when no usable topic was found.
+     */
+    protected function extractSearchTerms(string $message): array
+    {
+        // Break arbitrary text into index-compatible word tokens.
+        $toTokens = static function (string $text): array {
+            $clean = preg_replace('/[^a-z0-9]+/', ' ', strtolower($text)) ?? '';
+            $words = preg_split('/\s+/', $clean, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+            return array_values(array_filter($words, static function ($word) {
+                return strlen($word) >= 3;
+            }));
+        };
+        $topic = '';
+        if (preg_match('/\b(my|the|their|our|all|this|these)\s+(\w+)/i', $message, $matches) === 1) {
+            $topic = $matches[2];
+        }
+        if (strlen($topic) < 3) {
+            $stopWords = ['where', 'can', 'find', 'get', 'look', 'show', 'list', 'tell', 'know', 'want', 'need', 'help', 'give', 'me', 'i', 'is', 'are', 'was', 'were', 'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'what', 'how', 'why'];
+            // Tokenising strips punctuation, so "invoices?" is considered as "invoices".
+            foreach ($toTokens($message) as $word) {
+                if (strlen($word) >= 4 && !in_array($word, $stopWords, true)) {
+                    $topic = $word;
+                    break;
+                }
+            }
+        }
+        $topicTokens = $toTokens($topic);
+        if ($topicTokens === []) {
+            return [];
+        }
+        $prompt = "List 5 synonyms for: {$topic}. Return ONLY JSON array like: [\"word1\",\"word2\"]";
+        $response = $this->callOllama([
+            ['role' => 'system', 'content' => 'Return only valid JSON.'],
+            ['role' => 'user', 'content' => $prompt],
+        ]);
+        $json = $this->extractJson($response['content'] ?? '');
+        $terms = [];
+        // The model's output is untrusted: keep only string entries.
+        foreach (is_array($json) ? $json : [] as $synonym) {
+            if (is_string($synonym)) {
+                $terms = array_merge($terms, $toTokens($synonym));
+            }
+        }
+        $terms = array_merge($terms, $topicTokens);
+        return array_values(array_unique($terms));
+    }
     protected function findBestRouteWithTerms(array $routes, array $terms): ?string
     {
         $scored = [];