@gscdump/engine 0.11.1 → 0.11.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
1
  var AnalyzerCapabilityError = class extends Error {
2
+ tool;
3
+ missing;
2
4
  constructor(tool, missing) {
3
5
  super(`analyzer "${tool}" requires capabilities [${missing.join(", ")}] not provided by source`);
4
6
  this.tool = tool;
package/dist/entities.mjs CHANGED
@@ -26,6 +26,88 @@ function hashUrl(url) {
26
26
  }
27
27
  return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
28
28
  }
29
+ const INSPECTION_PARQUET_COLUMNS = [
30
+ {
31
+ name: "urlHash",
32
+ type: "VARCHAR",
33
+ nullable: false
34
+ },
35
+ {
36
+ name: "url",
37
+ type: "VARCHAR",
38
+ nullable: false
39
+ },
40
+ {
41
+ name: "inspectedAt",
42
+ type: "VARCHAR",
43
+ nullable: false
44
+ },
45
+ {
46
+ name: "indexStatus",
47
+ type: "VARCHAR",
48
+ nullable: true
49
+ },
50
+ {
51
+ name: "lastCrawlTime",
52
+ type: "VARCHAR",
53
+ nullable: true
54
+ },
55
+ {
56
+ name: "googleCanonical",
57
+ type: "VARCHAR",
58
+ nullable: true
59
+ },
60
+ {
61
+ name: "userCanonical",
62
+ type: "VARCHAR",
63
+ nullable: true
64
+ },
65
+ {
66
+ name: "coverageState",
67
+ type: "VARCHAR",
68
+ nullable: true
69
+ },
70
+ {
71
+ name: "robotsTxtState",
72
+ type: "VARCHAR",
73
+ nullable: true
74
+ },
75
+ {
76
+ name: "indexingState",
77
+ type: "VARCHAR",
78
+ nullable: true
79
+ },
80
+ {
81
+ name: "pageFetchState",
82
+ type: "VARCHAR",
83
+ nullable: true
84
+ },
85
+ {
86
+ name: "mobileUsabilityVerdict",
87
+ type: "VARCHAR",
88
+ nullable: true
89
+ },
90
+ {
91
+ name: "richResultsVerdict",
92
+ type: "VARCHAR",
93
+ nullable: true
94
+ },
95
+ {
96
+ name: "scheduleNextAt",
97
+ type: "BIGINT",
98
+ nullable: true
99
+ },
100
+ {
101
+ name: "scheduleConsecutiveUnchanged",
102
+ type: "INTEGER",
103
+ nullable: true
104
+ },
105
+ {
106
+ name: "schedulePolicyVersion",
107
+ type: "INTEGER",
108
+ nullable: true
109
+ }
110
+ ];
29
111
  function createInspectionStore(opts) {
30
112
  const hash = opts.hash ?? hashUrl;
31
113
  const ds = opts.dataSource;
@@ -117,88 +199,6 @@ function createInspectionStore(opts) {
117
199
  }
118
200
  };
119
201
  }
120
- const INSPECTION_PARQUET_COLUMNS = [
121
- {
122
- name: "urlHash",
123
- type: "VARCHAR",
124
- nullable: false
125
- },
126
- {
127
- name: "url",
128
- type: "VARCHAR",
129
- nullable: false
130
- },
131
- {
132
- name: "inspectedAt",
133
- type: "VARCHAR",
134
- nullable: false
135
- },
136
- {
137
- name: "indexStatus",
138
- type: "VARCHAR",
139
- nullable: true
140
- },
141
- {
142
- name: "lastCrawlTime",
143
- type: "VARCHAR",
144
- nullable: true
145
- },
146
- {
147
- name: "googleCanonical",
148
- type: "VARCHAR",
149
- nullable: true
150
- },
151
- {
152
- name: "userCanonical",
153
- type: "VARCHAR",
154
- nullable: true
155
- },
156
- {
157
- name: "coverageState",
158
- type: "VARCHAR",
159
- nullable: true
160
- },
161
- {
162
- name: "robotsTxtState",
163
- type: "VARCHAR",
164
- nullable: true
165
- },
166
- {
167
- name: "indexingState",
168
- type: "VARCHAR",
169
- nullable: true
170
- },
171
- {
172
- name: "pageFetchState",
173
- type: "VARCHAR",
174
- nullable: true
175
- },
176
- {
177
- name: "mobileUsabilityVerdict",
178
- type: "VARCHAR",
179
- nullable: true
180
- },
181
- {
182
- name: "richResultsVerdict",
183
- type: "VARCHAR",
184
- nullable: true
185
- },
186
- {
187
- name: "scheduleNextAt",
188
- type: "BIGINT",
189
- nullable: true
190
- },
191
- {
192
- name: "scheduleConsecutiveUnchanged",
193
- type: "INTEGER",
194
- nullable: true
195
- },
196
- {
197
- name: "schedulePolicyVersion",
198
- type: "INTEGER",
199
- nullable: true
200
- }
201
- ];
202
202
  function sitemapIndexKey(ctx) {
203
203
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/index.json` : `u_${ctx.userId}/entities/sitemaps/index.json`;
204
204
  }
@@ -1,4 +1,4 @@
1
- import { a as DataSource } from "./_chunks/storage.mjs";
1
+ import { a as DataSource, c as FileSetRef } from "./_chunks/storage.mjs";
2
2
  import { t as ColumnDef } from "./_chunks/schema.mjs";
3
3
  import { TenantCtx } from "gscdump/contracts";
4
4
  import * as _$_gscdump_engine_contracts0 from "@gscdump/engine/contracts";
@@ -13,10 +13,7 @@ interface RollupCtx extends TenantCtx {
13
13
  interface RollupEngine {
14
14
  runSQL: (opts: {
15
15
  ctx: TenantCtx;
16
- fileSets: Record<string, {
17
- table: _$_gscdump_engine_contracts0.TableName;
18
- partitions?: string[];
19
- }>;
16
+ fileSets: Record<string, FileSetRef>;
20
17
  table?: _$_gscdump_engine_contracts0.TableName;
21
18
  sql: string;
22
19
  params?: unknown[];
package/dist/rollups.mjs CHANGED
@@ -343,17 +343,17 @@ const indexingHealthRollup = {
343
343
  if (!await dataSource.head?.(key)) return { days: [] };
344
344
  const sql = `
345
345
  SELECT
346
- substr(inspectedAt, 1, 10) AS date,
346
+ substr(CAST(inspectedAt AS VARCHAR), 1, 10) AS date,
347
347
  COUNT(*)::BIGINT AS total_urls,
348
- SUM(CASE WHEN indexStatus = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
349
- SUM(CASE WHEN pageFetchState = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
350
- SUM(CASE WHEN pageFetchState = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
351
- SUM(CASE WHEN pageFetchState = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
352
- SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
353
- SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
354
- SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
348
+ SUM(CASE WHEN CAST(indexStatus AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
349
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
350
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
351
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
352
+ SUM(CASE WHEN CAST(mobileUsabilityVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
353
+ SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
354
+ SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
355
355
  FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
356
- WHERE substr(inspectedAt, 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
356
+ WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
357
357
  GROUP BY 1
358
358
  ORDER BY 1
359
359
  `;
@@ -2,6 +2,7 @@ import { h as resolveToSQL, n as pgResolverAdapter, s as assertDimensionsSupport
2
2
  import { i as getFilterDimensions } from "../_chunks/resolver.mjs";
3
3
  import { n as runAnalyzerFromSource } from "../_chunks/dispatch.mjs";
4
4
  var AttachedTableMissingError = class extends Error {
5
+ missing;
5
6
  constructor(missing) {
6
7
  super(`attached-table source: required table(s) not attached: ${missing.join(", ")}`);
7
8
  this.missing = missing;
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.11.1",
4
+ "version": "0.11.3",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -152,8 +152,8 @@
152
152
  },
153
153
  "peerDependencies": {
154
154
  "@duckdb/duckdb-wasm": "^1.32.0",
155
- "hyparquet": "^1.25.6",
156
- "hyparquet-writer": "^0.14.0"
155
+ "hyparquet": "^1.25.8",
156
+ "hyparquet-writer": "^0.15.1"
157
157
  },
158
158
  "peerDependenciesMeta": {
159
159
  "@duckdb/duckdb-wasm": {
@@ -169,14 +169,14 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.11.1"
172
+ "gscdump": "0.11.3"
173
173
  },
174
174
  "devDependencies": {
175
175
  "@duckdb/duckdb-wasm": "^1.32.0",
176
176
  "@types/proper-lockfile": "^4.1.4",
177
177
  "aws4fetch": "^1.0.20",
178
- "hyparquet": "^1.25.6",
179
- "hyparquet-writer": "^0.14.0",
178
+ "hyparquet": "^1.25.8",
179
+ "hyparquet-writer": "^0.15.1",
180
180
  "tsx": "^4.21.0",
181
181
  "vitest": "^4.1.5"
182
182
  },