lwazi 1.7.0 → 1.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -6,6 +6,9 @@ use Illuminate\Console\Command;
|
|
|
6
6
|
use Lwazi\Core\Installer\NavigationCrawler;
|
|
7
7
|
use Lwazi\Core\Services\GraphVisualizer;
|
|
8
8
|
use Illuminate\Support\Facades\Storage;
|
|
9
|
+
use Illuminate\Support\Str;
|
|
10
|
+
use DOMDocument;
|
|
11
|
+
use DOMXPath;
|
|
9
12
|
|
|
10
13
|
class AnalyzeProjectCommand extends Command
|
|
11
14
|
{
|
|
@@ -26,6 +29,11 @@ class AnalyzeProjectCommand extends Command
|
|
|
26
29
|
$crawler = new NavigationCrawler($rootUrl);
|
|
27
30
|
$manifest = $crawler->crawl();
|
|
28
31
|
|
|
32
|
+
if (empty($manifest['nodes'] ?? [])) {
|
|
33
|
+
$this->info("Crawler returned no pages, trying fallback...");
|
|
34
|
+
$manifest = $this->fallbackExtractLinks($rootUrl);
|
|
35
|
+
}
|
|
36
|
+
|
|
29
37
|
$manifest = $this->mergeWithRoutes($manifest, $rootUrl);
|
|
30
38
|
|
|
31
39
|
$storagePath = storage_path('lwazi');
|
|
@@ -37,7 +45,7 @@ class AnalyzeProjectCommand extends Command
|
|
|
37
45
|
file_put_contents($manifestFile, json_encode($manifest, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES));
|
|
38
46
|
|
|
39
47
|
$this->info("Analysis complete. Manifest stored at: {$manifestFile}");
|
|
40
|
-
$this->info("Pages discovered: " . count($manifest['nodes']));
|
|
48
|
+
$this->info("Pages discovered: " . count($manifest['nodes'] ?? $manifest['flat'] ?? []));
|
|
41
49
|
|
|
42
50
|
$this->info("\n" . str_repeat('=', 50));
|
|
43
51
|
$this->info('SITE NAVIGATION GRAPH');
|
|
@@ -138,4 +146,90 @@ class AnalyzeProjectCommand extends Command
|
|
|
138
146
|
$path = preg_replace('/\s+/', ' ', $path);
|
|
139
147
|
return ucwords(trim($path));
|
|
140
148
|
}
|
|
149
|
+
|
|
150
|
+
/**
 * Best-effort fallback for when the crawler yields no pages: fetch the root
 * page once and build a one-level manifest from its same-site <a href> links.
 *
 * Ordinary failures (non-2xx response, empty body, any \Exception raised while
 * fetching or parsing) produce an empty manifest instead of an exception.
 *
 * @param string $url Root URL to fetch. It is also the only key of the
 *                    'adjacency' map and the value of 'root_url'.
 *
 * @return array{nodes: array, adjacency: array, flat: array, root_url: string}
 *         'nodes' and 'flat' are keyed by absolute link URL.
 */
protected function fallbackExtractLinks(string $url): array
{
    $emptyManifest = ['nodes' => [], 'adjacency' => [], 'flat' => [], 'root_url' => $url];

    try {
        $response = \Illuminate\Support\Facades\Http::timeout(10)->get($url);
        if (!$response->successful()) {
            return $emptyManifest;
        }

        $html = $response->body();
        // loadHTML() rejects an empty string (ValueError on PHP 8, which the
        // \Exception handler below would not catch), so bail out early.
        if (trim($html) === '') {
            return $emptyManifest;
        }

        // Collect libxml parse errors internally instead of emitting warnings,
        // and restore the caller's setting afterwards so this stays local.
        $previousLibxmlMode = libxml_use_internal_errors(true);
        try {
            $dom = new DOMDocument();
            $dom->loadHTML($html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousLibxmlMode);
        }

        $xpath = new DOMXPath($dom);
        $links = $xpath->query('//a[@href]');

        $manifest = [
            'nodes' => [],
            'adjacency' => [$url => []],
            'flat' => [],
            'root_url' => $url,
        ];

        // Compare against the root with a boundary character so that
        // "http://example.com" does not match "http://example.com.evil.test".
        $root = rtrim($url, '/');
        $seen = [];

        foreach ($links as $a) {
            $href = trim($a->getAttribute('href'));
            if ($href === '' || Str::startsWith($href, ['#', 'javascript:', 'mailto:', 'tel:'])) {
                continue;
            }

            $fullUrl = $this->makeAbsolute($href, $url);
            $isSameSite = $fullUrl === $root
                || Str::startsWith($fullUrl, [$root . '/', $root . '?', $root . '#']);
            if (!$isSameSite || isset($seen[$fullUrl])) {
                continue;
            }
            $seen[$fullUrl] = true;

            $text = trim($a->textContent ?? '');
            $label = $text !== '' ? $text : basename($fullUrl);

            // parse_url() yields null when there is no path and false for a
            // malformed URL; both are treated as "no segments".
            $path = parse_url($fullUrl, PHP_URL_PATH);
            $segments = [];
            if (is_string($path)) {
                // array_values() keeps this a list so it JSON-encodes as an array.
                $segments = array_values(array_filter(
                    explode('/', $path),
                    static function (string $segment): bool {
                        return $segment !== '';
                    }
                ));
            }

            $manifest['nodes'][$fullUrl] = [
                'url' => $fullUrl,
                'title' => $label,
                'headings' => [$text],
            ];

            $manifest['adjacency'][$url][] = $fullUrl;

            $manifest['flat'][$fullUrl] = [
                'label' => $label,
                'segments' => $segments,
                '_path' => $fullUrl,
                '_weight' => 1,
            ];
        }

        return $manifest;
    } catch (\Exception $e) {
        return $emptyManifest;
    }
}
|
|
217
|
+
|
|
218
|
+
/**
 * Resolve an href found on the page at $base into an absolute URL.
 *
 * Supported forms: absolute http(s) URLs (returned unchanged),
 * protocol-relative URLs ("//host/path"), the bare root ("/", which maps to
 * $base), root-relative paths ("/path") and plain relative paths ("path").
 * Plain relative paths are appended to $base as-is; "./" and "../" segments
 * are not normalised.
 *
 * @param string $href Link target as written in the document.
 * @param string $base Absolute URL the link was found on.
 *
 * @return string Absolute URL for $href.
 */
protected function makeAbsolute(string $href, string $base): string
{
    // Schemes are case-insensitive, so "HTTP://..." is already absolute too.
    if (preg_match('#^https?://#i', $href) === 1) {
        return $href;
    }

    if ($href === '/') {
        return $base;
    }

    $parsed = parse_url($base);
    if (!is_array($parsed)) {
        $parsed = [];
    }
    $scheme = $parsed['scheme'] ?? 'http';

    // Protocol-relative URL: reuse the base scheme, keep the href's own host.
    if (strncmp($href, '//', 2) === 0) {
        return $scheme . ':' . $href;
    }

    if (strncmp($href, '/', 1) === 0) {
        $origin = $scheme . '://' . ($parsed['host'] ?? '');
        // Keep a non-default port (e.g. http://localhost:8000); without it the
        // result points at a different origin than the base.
        if (isset($parsed['port'])) {
            $origin .= ':' . $parsed['port'];
        }

        return $origin . $href;
    }

    // Avoid "//" in the result when the base already ends with a slash.
    return rtrim($base, '/') . '/' . $href;
}
|
|
141
235
|
}
|
|
@@ -108,15 +108,21 @@ class SetupCommand extends Command
|
|
|
108
108
|
protected function runCrawler(string $url): void
|
|
109
109
|
{
|
|
110
110
|
$this->info("\nCrawling website: {$url}...");
|
|
111
|
+
|
|
111
112
|
try {
|
|
112
113
|
$crawler = new NavigationCrawler($url, true);
|
|
113
114
|
$manifest = $crawler->crawl();
|
|
114
115
|
|
|
116
|
+
if (empty($manifest['nodes'] ?? [])) {
|
|
117
|
+
$this->info("Crawler returned no pages, trying fallback extraction...");
|
|
118
|
+
$manifest = $this->fallbackExtractLinks($url);
|
|
119
|
+
}
|
|
120
|
+
|
|
115
121
|
$manifest = $this->mergeWithRoutes($manifest, $url);
|
|
116
122
|
|
|
117
123
|
$crawler->saveManifest();
|
|
118
124
|
|
|
119
|
-
$this->info("Crawled " . count($manifest['nodes']) . " pages.");
|
|
125
|
+
$this->info("Crawled " . count($manifest['nodes'] ?? $manifest['flat'] ?? []) . " pages.");
|
|
120
126
|
|
|
121
127
|
$this->info("\n" . str_repeat('=', 50));
|
|
122
128
|
$this->info('SITE NAVIGATION GRAPH');
|
|
@@ -130,8 +136,107 @@ class SetupCommand extends Command
|
|
|
130
136
|
$this->line($summaryOutput);
|
|
131
137
|
} catch (\Throwable $e) {
|
|
132
138
|
$this->warn('Website crawling failed: ' . $e->getMessage());
|
|
139
|
+
|
|
140
|
+
$this->info("Trying fallback link extraction...");
|
|
141
|
+
try {
|
|
142
|
+
$manifest = $this->fallbackExtractLinks($url);
|
|
143
|
+
$manifest = $this->mergeWithRoutes($manifest, $url);
|
|
144
|
+
|
|
145
|
+
$crawler = new NavigationCrawler($url);
|
|
146
|
+
$crawler->saveManifest();
|
|
147
|
+
|
|
148
|
+
$this->info("Fallback extraction complete.");
|
|
149
|
+
} catch (\Throwable $e2) {
|
|
150
|
+
$this->warn('Fallback also failed: ' . $e2->getMessage());
|
|
151
|
+
}
|
|
133
152
|
}
|
|
134
153
|
}
|
|
154
|
+
|
|
155
|
+
/**
 * Best-effort fallback for when the crawler yields no pages: fetch the root
 * page once and build a one-level manifest from its same-site <a href> links.
 *
 * Ordinary failures (non-2xx response, empty body, any \Exception raised while
 * fetching or parsing) produce an empty manifest instead of an exception.
 *
 * @param string $url Root URL to fetch. It is also the only key of the
 *                    'adjacency' map and the value of 'root_url'.
 *
 * @return array{nodes: array, adjacency: array, flat: array, root_url: string}
 *         'nodes' and 'flat' are keyed by absolute link URL.
 */
protected function fallbackExtractLinks(string $url): array
{
    $emptyManifest = ['nodes' => [], 'adjacency' => [], 'flat' => [], 'root_url' => $url];

    try {
        $response = \Illuminate\Support\Facades\Http::timeout(10)->get($url);
        if (!$response->successful()) {
            return $emptyManifest;
        }

        $html = $response->body();
        // loadHTML() rejects an empty string (ValueError on PHP 8, which the
        // \Exception handler below would not catch), so bail out early.
        if (trim($html) === '') {
            return $emptyManifest;
        }

        // Collect libxml parse errors internally instead of emitting warnings,
        // and restore the caller's setting afterwards so this stays local.
        $previousLibxmlMode = libxml_use_internal_errors(true);
        try {
            $dom = new DOMDocument();
            $dom->loadHTML($html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousLibxmlMode);
        }

        $xpath = new DOMXPath($dom);
        $links = $xpath->query('//a[@href]');

        $manifest = [
            'nodes' => [],
            'adjacency' => [$url => []],
            'flat' => [],
            'root_url' => $url,
        ];

        // Compare against the root with a boundary character so that
        // "http://example.com" does not match "http://example.com.evil.test".
        $root = rtrim($url, '/');
        $seen = [];

        foreach ($links as $a) {
            $href = trim($a->getAttribute('href'));
            if ($href === '' || Str::startsWith($href, ['#', 'javascript:', 'mailto:', 'tel:'])) {
                continue;
            }

            $fullUrl = $this->makeAbsolute($href, $url);
            $isSameSite = $fullUrl === $root
                || Str::startsWith($fullUrl, [$root . '/', $root . '?', $root . '#']);
            if (!$isSameSite || isset($seen[$fullUrl])) {
                continue;
            }
            $seen[$fullUrl] = true;

            $text = trim($a->textContent ?? '');
            $label = $text !== '' ? $text : basename($fullUrl);

            // parse_url() yields null when there is no path and false for a
            // malformed URL; both are treated as "no segments".
            $path = parse_url($fullUrl, PHP_URL_PATH);
            $segments = [];
            if (is_string($path)) {
                // array_values() keeps this a list so it JSON-encodes as an array.
                $segments = array_values(array_filter(
                    explode('/', $path),
                    static function (string $segment): bool {
                        return $segment !== '';
                    }
                ));
            }

            $manifest['nodes'][$fullUrl] = [
                'url' => $fullUrl,
                'title' => $label,
                'headings' => [$text],
            ];

            $manifest['adjacency'][$url][] = $fullUrl;

            $manifest['flat'][$fullUrl] = [
                'label' => $label,
                'segments' => $segments,
                '_path' => $fullUrl,
                '_weight' => 1,
            ];
        }

        return $manifest;
    } catch (\Exception $e) {
        return $emptyManifest;
    }
}
|
|
222
|
+
|
|
223
|
+
/**
 * Resolve an href found on the page at $base into an absolute URL.
 *
 * Supported forms: absolute http(s) URLs (returned unchanged),
 * protocol-relative URLs ("//host/path"), the bare root ("/", which maps to
 * $base), root-relative paths ("/path") and plain relative paths ("path").
 * Plain relative paths are appended to $base as-is; "./" and "../" segments
 * are not normalised.
 *
 * @param string $href Link target as written in the document.
 * @param string $base Absolute URL the link was found on.
 *
 * @return string Absolute URL for $href.
 */
protected function makeAbsolute(string $href, string $base): string
{
    // Schemes are case-insensitive, so "HTTP://..." is already absolute too.
    if (preg_match('#^https?://#i', $href) === 1) {
        return $href;
    }

    if ($href === '/') {
        return $base;
    }

    $parsed = parse_url($base);
    if (!is_array($parsed)) {
        $parsed = [];
    }
    $scheme = $parsed['scheme'] ?? 'http';

    // Protocol-relative URL: reuse the base scheme, keep the href's own host.
    if (strncmp($href, '//', 2) === 0) {
        return $scheme . ':' . $href;
    }

    if (strncmp($href, '/', 1) === 0) {
        $origin = $scheme . '://' . ($parsed['host'] ?? '');
        // Keep a non-default port (e.g. http://localhost:8000); without it the
        // result points at a different origin than the base.
        if (isset($parsed['port'])) {
            $origin .= ':' . $parsed['port'];
        }

        return $origin . $href;
    }

    // Avoid "//" in the result when the base already ends with a slash.
    return rtrim($base, '/') . '/' . $href;
}
|
|
135
240
|
|
|
136
241
|
protected function mergeWithRoutes(array $manifest, string $rootUrl): array
|
|
137
242
|
{
|
|
@@ -189,8 +189,20 @@ class LwaziService
|
|
|
189
189
|
]);
|
|
190
190
|
|
|
191
191
|
$json = $this->extractJson($response2['content'] ?? '');
|
|
192
|
-
$
|
|
193
|
-
|
|
192
|
+
$rawTerms = is_array($json) ? array_values($json) : [];
|
|
193
|
+
|
|
194
|
+
$terms = [];
|
|
195
|
+
foreach ($rawTerms as $t) {
|
|
196
|
+
$t = strtolower(trim($t));
|
|
197
|
+
if (strpos($t, ' ') === false && strlen($t) >= 3) {
|
|
198
|
+
$terms[] = $t;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
$first = explode(' ', $topic)[0];
|
|
203
|
+
if (strlen($first) >= 3) {
|
|
204
|
+
$terms[] = strtolower($first);
|
|
205
|
+
}
|
|
194
206
|
|
|
195
207
|
if (empty($terms)) {
|
|
196
208
|
return null;
|
|
@@ -358,8 +370,20 @@ class LwaziService
|
|
|
358
370
|
]);
|
|
359
371
|
|
|
360
372
|
$json = $this->extractJson($response2['content'] ?? '');
|
|
361
|
-
$
|
|
362
|
-
|
|
373
|
+
$rawTerms = is_array($json) ? array_values($json) : [];
|
|
374
|
+
|
|
375
|
+
$terms = [];
|
|
376
|
+
foreach ($rawTerms as $t) {
|
|
377
|
+
$t = strtolower(trim($t));
|
|
378
|
+
if (strpos($t, ' ') === false && strlen($t) >= 3) {
|
|
379
|
+
$terms[] = $t;
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
$first = explode(' ', $topic)[0];
|
|
384
|
+
if (strlen($first) >= 3) {
|
|
385
|
+
$terms[] = strtolower($first);
|
|
386
|
+
}
|
|
363
387
|
|
|
364
388
|
if (empty($terms)) {
|
|
365
389
|
return null;
|