@crawlith/core 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/analysis/analysis_list.html +35 -0
  3. package/dist/analysis/analysis_page.html +123 -0
  4. package/dist/analysis/analyze.d.ts +17 -3
  5. package/dist/analysis/analyze.js +192 -248
  6. package/dist/analysis/scoring.js +7 -1
  7. package/dist/analysis/templates.d.ts +2 -0
  8. package/dist/analysis/templates.js +7 -0
  9. package/dist/core/security/ipGuard.d.ts +11 -0
  10. package/dist/core/security/ipGuard.js +71 -3
  11. package/dist/crawler/crawl.d.ts +4 -22
  12. package/dist/crawler/crawl.js +4 -335
  13. package/dist/crawler/crawler.d.ts +75 -0
  14. package/dist/crawler/crawler.js +518 -0
  15. package/dist/crawler/extract.d.ts +4 -1
  16. package/dist/crawler/extract.js +7 -2
  17. package/dist/crawler/fetcher.d.ts +1 -0
  18. package/dist/crawler/fetcher.js +20 -5
  19. package/dist/crawler/metricsRunner.d.ts +3 -1
  20. package/dist/crawler/metricsRunner.js +55 -46
  21. package/dist/crawler/sitemap.d.ts +3 -0
  22. package/dist/crawler/sitemap.js +5 -1
  23. package/dist/db/graphLoader.js +32 -3
  24. package/dist/db/index.d.ts +3 -0
  25. package/dist/db/index.js +4 -0
  26. package/dist/db/repositories/EdgeRepository.d.ts +8 -0
  27. package/dist/db/repositories/EdgeRepository.js +13 -0
  28. package/dist/db/repositories/MetricsRepository.d.ts +3 -0
  29. package/dist/db/repositories/MetricsRepository.js +14 -1
  30. package/dist/db/repositories/PageRepository.d.ts +11 -0
  31. package/dist/db/repositories/PageRepository.js +112 -19
  32. package/dist/db/repositories/SiteRepository.d.ts +3 -0
  33. package/dist/db/repositories/SiteRepository.js +9 -0
  34. package/dist/db/repositories/SnapshotRepository.d.ts +2 -0
  35. package/dist/db/repositories/SnapshotRepository.js +23 -2
  36. package/dist/events.d.ts +48 -0
  37. package/dist/events.js +1 -0
  38. package/dist/graph/cluster.js +62 -14
  39. package/dist/graph/duplicate.js +242 -191
  40. package/dist/graph/graph.d.ts +16 -0
  41. package/dist/graph/graph.js +17 -4
  42. package/dist/graph/metrics.js +12 -0
  43. package/dist/graph/pagerank.js +2 -0
  44. package/dist/graph/simhash.d.ts +6 -0
  45. package/dist/graph/simhash.js +14 -0
  46. package/dist/index.d.ts +5 -2
  47. package/dist/index.js +5 -2
  48. package/dist/lock/hashKey.js +1 -1
  49. package/dist/lock/lockManager.d.ts +4 -1
  50. package/dist/lock/lockManager.js +23 -13
  51. package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
  52. package/dist/report/crawlExport.d.ts +3 -0
  53. package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
  54. package/dist/report/crawl_template.d.ts +1 -0
  55. package/dist/report/crawl_template.js +7 -0
  56. package/dist/report/html.js +15 -216
  57. package/dist/scoring/health.d.ts +50 -0
  58. package/dist/scoring/health.js +170 -0
  59. package/dist/scoring/hits.d.ts +1 -0
  60. package/dist/scoring/hits.js +64 -44
  61. package/dist/scoring/orphanSeverity.d.ts +5 -5
  62. package/package.json +3 -3
  63. package/scripts/copy-assets.js +37 -0
  64. package/src/analysis/analysis_list.html +35 -0
  65. package/src/analysis/analysis_page.html +123 -0
  66. package/src/analysis/analyze.ts +218 -261
  67. package/src/analysis/scoring.ts +8 -1
  68. package/src/analysis/templates.ts +9 -0
  69. package/src/core/security/ipGuard.ts +82 -3
  70. package/src/crawler/crawl.ts +6 -379
  71. package/src/crawler/crawler.ts +601 -0
  72. package/src/crawler/extract.ts +7 -2
  73. package/src/crawler/fetcher.ts +24 -6
  74. package/src/crawler/metricsRunner.ts +60 -47
  75. package/src/crawler/sitemap.ts +4 -1
  76. package/src/db/graphLoader.ts +33 -3
  77. package/src/db/index.ts +5 -0
  78. package/src/db/repositories/EdgeRepository.ts +14 -0
  79. package/src/db/repositories/MetricsRepository.ts +15 -1
  80. package/src/db/repositories/PageRepository.ts +119 -19
  81. package/src/db/repositories/SiteRepository.ts +11 -0
  82. package/src/db/repositories/SnapshotRepository.ts +28 -3
  83. package/src/events.ts +16 -0
  84. package/src/graph/cluster.ts +69 -15
  85. package/src/graph/duplicate.ts +249 -185
  86. package/src/graph/graph.ts +24 -4
  87. package/src/graph/metrics.ts +15 -0
  88. package/src/graph/pagerank.ts +1 -0
  89. package/src/graph/simhash.ts +15 -0
  90. package/src/index.ts +5 -2
  91. package/src/lock/hashKey.ts +1 -1
  92. package/src/lock/lockManager.ts +21 -13
  93. package/{dist/report/sitegraph_template.js → src/report/crawl.html} +330 -81
  94. package/src/report/{sitegraphExport.ts → crawlExport.ts} +3 -3
  95. package/src/report/crawl_template.ts +9 -0
  96. package/src/report/html.ts +17 -217
  97. package/src/scoring/health.ts +241 -0
  98. package/src/scoring/hits.ts +67 -45
  99. package/src/scoring/orphanSeverity.ts +8 -8
  100. package/tests/analysis.unit.test.ts +44 -0
  101. package/tests/analyze.integration.test.ts +88 -53
  102. package/tests/analyze_markdown.test.ts +98 -0
  103. package/tests/audit/audit.test.ts +101 -0
  104. package/tests/audit/scoring.test.ts +25 -25
  105. package/tests/audit/transport.test.ts +0 -1
  106. package/tests/clustering_risk.test.ts +118 -0
  107. package/tests/crawler.test.ts +19 -13
  108. package/tests/db/index.test.ts +134 -0
  109. package/tests/db/repositories.test.ts +115 -0
  110. package/tests/db_repos.test.ts +72 -0
  111. package/tests/duplicate.test.ts +2 -2
  112. package/tests/extract.test.ts +86 -0
  113. package/tests/fetcher.test.ts +5 -1
  114. package/tests/fetcher_safety.test.ts +9 -3
  115. package/tests/graph/graph.test.ts +100 -0
  116. package/tests/graphLoader.test.ts +124 -0
  117. package/tests/html_report.test.ts +52 -51
  118. package/tests/ipGuard.test.ts +73 -0
  119. package/tests/lock/lockManager.test.ts +77 -17
  120. package/tests/normalize.test.ts +6 -19
  121. package/tests/orphanSeverity.test.ts +9 -9
  122. package/tests/redirect_safety.test.ts +5 -1
  123. package/tests/renderAnalysisCsv.test.ts +183 -0
  124. package/tests/safety.test.ts +12 -0
  125. package/tests/scope.test.ts +18 -0
  126. package/tests/scoring.test.ts +25 -24
  127. package/tests/sitemap.test.ts +13 -1
  128. package/tests/ssrf_fix.test.ts +69 -0
  129. package/tests/visualization_data.test.ts +10 -10
  130. package/dist/report/sitegraphExport.d.ts +0 -3
  131. package/dist/report/sitegraph_template.d.ts +0 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # @crawlith/core
2
2
 
3
+ ## 0.1.1
4
+
5
+ ### Patch Changes
6
+
7
+ - Bump all packages to next patch version.
8
+
3
9
  ## 0.1.0
4
10
 
5
11
  ### Minor Changes
@@ -0,0 +1,35 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <title>Crawlith Analysis Report</title>
6
+ <style>
7
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; color: #333; }
8
+ h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
9
+ table { width: 100%; border-collapse: collapse; margin-top: 20px; }
10
+ th, td { padding: 8px 12px; border: 1px solid #ddd; text-align: left; }
11
+ th { background-color: #f4f4f4; }
12
+ tr:nth-child(even) { background-color: #f9f9f9; }
13
+ tr:hover { background-color: #f1f1f1; }
14
+ </style>
15
+ </head>
16
+ <body>
17
+ <h1>Analysis</h1>
18
+ <p>Pages: {{PAGES_ANALYZED}}</p>
19
+ <p>Average SEO: {{AVG_SEO_SCORE}}</p>
20
+ <table border="1" cellspacing="0" cellpadding="6">
21
+ <thead>
22
+ <tr>
23
+ <th>URL</th>
24
+ <th>SEO Score</th>
25
+ <th>Thin Score</th>
26
+ <th>Title</th>
27
+ <th>Meta</th>
28
+ </tr>
29
+ </thead>
30
+ <tbody>
31
+ {{ROWS}}
32
+ </tbody>
33
+ </table>
34
+ </body>
35
+ </html>
@@ -0,0 +1,123 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Analysis for {{URL}}</title>
7
+ <style>
8
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; line-height: 1.6; color: #333; }
9
+ h1 { border-bottom: 2px solid #eee; padding-bottom: 10px; }
10
+ h2 { margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 5px; }
11
+ .score-card { display: flex; gap: 20px; margin-bottom: 30px; }
12
+ .score-box { background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; flex: 1; border: 1px solid #e1e4e8; }
13
+ .score-val { font-size: 24px; font-weight: bold; color: #0366d6; }
14
+ .status-ok { color: green; font-weight: bold; }
15
+ .status-warning { color: orange; font-weight: bold; }
16
+ .status-critical { color: red; font-weight: bold; }
17
+ .status-missing { color: red; font-weight: bold; }
18
+ .data-table { width: 100%; border-collapse: collapse; margin-top: 10px; }
19
+ .data-table th, .data-table td { text-align: left; padding: 8px; border-bottom: 1px solid #eee; }
20
+ .data-table th { width: 150px; color: #666; }
21
+ code { background: #f6f8fa; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
22
+ </style>
23
+ </head>
24
+ <body>
25
+ <h1>Page Analysis</h1>
26
+ <p><strong>URL:</strong> <a href="{{URL}}" target="_blank">{{URL}}</a></p>
27
+
28
+ <div class="score-card">
29
+ <div class="score-box">
30
+ <div class="score-val">{{SEO_SCORE}}</div>
31
+ <div>SEO Score</div>
32
+ </div>
33
+ <div class="score-box">
34
+ <div class="score-val">{{THIN_SCORE}}</div>
35
+ <div>Thin Content Score</div>
36
+ </div>
37
+ <div class="score-box">
38
+ <div class="score-val">{{HTTP_STATUS}}</div>
39
+ <div>HTTP Status</div>
40
+ </div>
41
+ </div>
42
+
43
+ <h2>Meta Tags</h2>
44
+ <table class="data-table">
45
+ <tr>
46
+ <th>Title</th>
47
+ <td>
48
+ <div>{{TITLE_VALUE}}</div>
49
+ <small>Length: {{TITLE_LENGTH}} | Status: <span class="status-{{TITLE_STATUS}}">{{TITLE_STATUS}}</span></small>
50
+ </td>
51
+ </tr>
52
+ <tr>
53
+ <th>Description</th>
54
+ <td>
55
+ <div>{{META_DESCRIPTION_VALUE}}</div>
56
+ <small>Length: {{META_DESCRIPTION_LENGTH}} | Status: <span class="status-{{META_DESCRIPTION_STATUS}}">{{META_DESCRIPTION_STATUS}}</span></small>
57
+ </td>
58
+ </tr>
59
+ <tr>
60
+ <th>Canonical</th>
61
+ <td>{{CANONICAL}}</td>
62
+ </tr>
63
+ <tr>
64
+ <th>Robots</th>
65
+ <td>
66
+ Index: {{ROBOTS_INDEX}},
67
+ Follow: {{ROBOTS_FOLLOW}}
68
+ </td>
69
+ </tr>
70
+ </table>
71
+
72
+ <h2>Content & Heading</h2>
73
+ <table class="data-table">
74
+ <tr>
75
+ <th>H1 Tag</th>
76
+ <td>
77
+ Status: <span class="status-{{H1_STATUS}}">{{H1_STATUS}}</span>
78
+ ({{H1_COUNT}} detected)
79
+ {{H1_MATCHES_TITLE}}
80
+ </td>
81
+ </tr>
82
+ <tr>
83
+ <th>Word Count</th>
84
+ <td>{{WORD_COUNT}} words</td>
85
+ </tr>
86
+ <tr>
87
+ <th>Unique Sentences</th>
88
+ <td>{{UNIQUE_SENTENCES}}</td>
89
+ </tr>
90
+ <tr>
91
+ <th>Text / HTML Ratio</th>
92
+ <td>{{TEXT_HTML_RATIO}}%</td>
93
+ </tr>
94
+ </table>
95
+
96
+ <h2>Links & Images</h2>
97
+ <table class="data-table">
98
+ <tr>
99
+ <th>Internal Links</th>
100
+ <td>{{INTERNAL_LINKS}}</td>
101
+ </tr>
102
+ <tr>
103
+ <th>External Links</th>
104
+ <td>{{EXTERNAL_LINKS}} ({{EXTERNAL_RATIO}}%)</td>
105
+ </tr>
106
+ <tr>
107
+ <th>Images</th>
108
+ <td>{{TOTAL_IMAGES}} total ({{MISSING_ALT}} missing alt text)</td>
109
+ </tr>
110
+ </table>
111
+
112
+ <h2>Structured Data</h2>
113
+ <table class="data-table">
114
+ <tr>
115
+ <th>Status</th>
116
+ <td>
117
+ {{STRUCTURED_DATA_STATUS}}
118
+ </td>
119
+ </tr>
120
+ {{STRUCTURED_DATA_TYPES_ROW}}
121
+ </table>
122
+ </body>
123
+ </html>
@@ -5,6 +5,7 @@ import { ImageAltAnalysis } from './images.js';
5
5
  import { LinkRatioAnalysis } from './links.js';
6
6
  import { StructuredDataResult } from './structuredData.js';
7
7
  import { aggregateSiteScore } from './scoring.js';
8
+ import { EngineContext } from '../events.js';
8
9
  export interface CrawlPage {
9
10
  url: string;
10
11
  status?: number;
@@ -13,11 +14,10 @@ export interface CrawlPage {
13
14
  canonical?: string;
14
15
  noindex?: boolean;
15
16
  nofollow?: boolean;
17
+ crawlStatus?: string;
16
18
  }
17
19
  export interface AnalyzeOptions {
18
- fromCrawl?: string;
19
20
  live?: boolean;
20
- html?: boolean;
21
21
  seo?: boolean;
22
22
  content?: boolean;
23
23
  accessibility?: boolean;
@@ -28,6 +28,7 @@ export interface AnalyzeOptions {
28
28
  debug?: boolean;
29
29
  clusterThreshold?: number;
30
30
  minClusterSize?: number;
31
+ allPages?: boolean;
31
32
  }
32
33
  export interface PageAnalysis {
33
34
  url: string;
@@ -45,6 +46,7 @@ export interface PageAnalysis {
45
46
  canonical?: string;
46
47
  noindex?: boolean;
47
48
  nofollow?: boolean;
49
+ crawlStatus?: string;
48
50
  };
49
51
  }
50
52
  export interface AnalysisResult {
@@ -63,8 +65,20 @@ export interface AnalysisResult {
63
65
  accessibility: boolean;
64
66
  };
65
67
  clusters?: ClusterInfo[];
68
+ snapshotId?: number;
69
+ crawledAt?: string;
66
70
  }
67
- export declare function analyzeSite(url: string, options: AnalyzeOptions): Promise<AnalysisResult>;
71
+ /**
72
+ * Analyzes a site for SEO, content, and accessibility.
73
+ * Supports live crawling or loading from a database snapshot.
74
+ * Note: File-based data loading is not supported.
75
+ *
76
+ * @param url The root URL to analyze
77
+ * @param options Analysis options
78
+ * @param context Engine context for event emission
79
+ */
80
+ export declare function analyzeSite(url: string, options: AnalyzeOptions, context?: EngineContext): Promise<AnalysisResult>;
68
81
  export declare function renderAnalysisHtml(result: AnalysisResult): string;
69
82
  export declare function renderAnalysisMarkdown(result: AnalysisResult): string;
70
83
  export declare function renderAnalysisCsv(result: AnalysisResult): string;
84
+ export declare function analyzePages(rootUrl: string, pages: Iterable<CrawlPage> | CrawlPage[], robots?: any): PageAnalysis[];