notro-loader 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +276 -276
- package/image-service.ts +45 -45
- package/index.ts +21 -21
- package/integration.ts +13 -13
- package/package.json +4 -3
- package/src/components/NotroContent.astro +37 -37
- package/src/loader/live-loader.ts +181 -181
- package/src/loader/loader.ts +268 -268
- package/src/loader/schema.ts +837 -837
- package/src/utils/HtmlElements.ts +27 -27
- package/src/utils/compile-mdx.ts +159 -159
- package/src/utils/default-components.ts +62 -62
- package/src/utils/mdx-pipeline.ts +501 -501
- package/src/utils/notion-url.ts +49 -49
- package/src/utils/notion.ts +127 -127
- package/src/utils/notro-config.ts +35 -35
- package/utils.ts +11 -11
package/src/loader/loader.ts
CHANGED
|
@@ -1,268 +1,268 @@
|
|
|
1
|
-
import type { Loader } from "astro/loaders";
|
|
2
|
-
import {
|
|
3
|
-
Client,
|
|
4
|
-
isFullPage,
|
|
5
|
-
iteratePaginatedAPI,
|
|
6
|
-
APIErrorCode,
|
|
7
|
-
APIResponseError,
|
|
8
|
-
} from "@notionhq/client";
|
|
9
|
-
import type { QueryDataSourceParameters } from "@notionhq/client";
|
|
10
|
-
import {
|
|
11
|
-
type PageWithMarkdownType,
|
|
12
|
-
pageWithMarkdownSchema,
|
|
13
|
-
} from "./schema.ts";
|
|
14
|
-
import { markdownHasPresignedUrls } from "../utils/notion-url.ts";
|
|
15
|
-
|
|
16
|
-
type LoaderOptions = {
|
|
17
|
-
queryParameters: QueryDataSourceParameters;
|
|
18
|
-
// Derive from Client constructor to avoid importing from internal paths
|
|
19
|
-
clientOptions: ConstructorParameters<typeof Client>[0];
|
|
20
|
-
};
|
|
21
|
-
|
|
22
|
-
// Notion file-type covers, icons, and inline images use pre-signed S3 URLs that expire after ~1 hour.
|
|
23
|
-
// If any are present in a cached entry, it must be re-fetched to get fresh URLs.
|
|
24
|
-
function hasNotionPresignedUrl(data: PageWithMarkdownType): boolean {
|
|
25
|
-
if (data.cover?.type === "file") return true;
|
|
26
|
-
if (data.icon?.type === "file") return true;
|
|
27
|
-
return markdownHasPresignedUrls(data.markdown);
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
// Error codes that are safe to retry (rate limit, server errors).
|
|
31
|
-
const RETRYABLE_API_ERROR_CODES: ReadonlySet<string> = new Set([
|
|
32
|
-
APIErrorCode.RateLimited,
|
|
33
|
-
APIErrorCode.InternalServerError,
|
|
34
|
-
APIErrorCode.ServiceUnavailable,
|
|
35
|
-
]);
|
|
36
|
-
|
|
37
|
-
// Retry delays in milliseconds for each attempt (exponential backoff: 1s, 2s, 4s).
|
|
38
|
-
const RETRY_DELAYS_MS = [1000, 2000, 4000];
|
|
39
|
-
|
|
40
|
-
/**
|
|
41
|
-
* Generic retry wrapper for Notion API calls.
|
|
42
|
-
* - 429 (RateLimited), 500 (InternalServerError), 503 (ServiceUnavailable): retry up to 3 times
|
|
43
|
-
* with exponential backoff (1s / 2s / 4s).
|
|
44
|
-
* - Other errors (401, 403, 404, etc.): re-thrown immediately.
|
|
45
|
-
*
|
|
46
|
-
* @param fn - Async function to call (and retry on transient errors).
|
|
47
|
-
* @param label - Human-readable label for warning messages (e.g. "Page <id>").
|
|
48
|
-
* @param logger - Logger with a warn method.
|
|
49
|
-
*/
|
|
50
|
-
async function withRetry<T>(
|
|
51
|
-
fn: () => Promise<T>,
|
|
52
|
-
label: string,
|
|
53
|
-
logger: { warn: (msg: string) => void },
|
|
54
|
-
): Promise<T> {
|
|
55
|
-
let lastError: unknown;
|
|
56
|
-
|
|
57
|
-
for (let attempt = 0; attempt <= RETRY_DELAYS_MS.length; attempt++) {
|
|
58
|
-
try {
|
|
59
|
-
return await fn();
|
|
60
|
-
} catch (error) {
|
|
61
|
-
lastError = error;
|
|
62
|
-
|
|
63
|
-
const isRetryable =
|
|
64
|
-
error instanceof APIResponseError &&
|
|
65
|
-
RETRYABLE_API_ERROR_CODES.has(error.code);
|
|
66
|
-
|
|
67
|
-
if (!isRetryable || attempt === RETRY_DELAYS_MS.length) {
|
|
68
|
-
throw error;
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
const delayMs = RETRY_DELAYS_MS[attempt];
|
|
72
|
-
logger.warn(
|
|
73
|
-
`${label}: API error "${(error as APIResponseError).code}" (attempt ${attempt + 1}/${RETRY_DELAYS_MS.length + 1}), retrying in ${delayMs}ms...`,
|
|
74
|
-
);
|
|
75
|
-
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
// Unreachable, but required for TypeScript exhaustiveness.
|
|
80
|
-
throw lastError;
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
/**
|
|
84
|
-
* Calls iteratePaginatedAPI(client.dataSources.query, ...) with retry logic for transient errors.
|
|
85
|
-
* - 429 (RateLimited), 500 (InternalServerError), 503 (ServiceUnavailable): retry up to 3 times
|
|
86
|
-
* with exponential backoff (1s / 2s / 4s).
|
|
87
|
-
* - Other errors (401, 403, 404, etc.): re-thrown immediately.
|
|
88
|
-
*/
|
|
89
|
-
async function queryDataSourceWithRetry(
|
|
90
|
-
client: Client,
|
|
91
|
-
queryParameters: QueryDataSourceParameters,
|
|
92
|
-
logger: { warn: (msg: string) => void },
|
|
93
|
-
): Promise<Awaited<ReturnType<typeof iteratePaginatedAPI<typeof client.dataSources.query>>>[]> {
|
|
94
|
-
return withRetry(
|
|
95
|
-
() => Array.fromAsync(iteratePaginatedAPI(client.dataSources.query, queryParameters)),
|
|
96
|
-
"dataSources.query",
|
|
97
|
-
logger,
|
|
98
|
-
);
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
/**
|
|
102
|
-
* Calls client.pages.retrieveMarkdown with retry logic for transient errors.
|
|
103
|
-
* - 429 (RateLimited), 500 (InternalServerError), 503 (ServiceUnavailable): retry up to 3 times
|
|
104
|
-
* with exponential backoff (1s / 2s / 4s).
|
|
105
|
-
* - Other errors: re-thrown immediately.
|
|
106
|
-
*/
|
|
107
|
-
async function retrieveMarkdownWithRetry(
|
|
108
|
-
client: Client,
|
|
109
|
-
pageId: string,
|
|
110
|
-
logger: { warn: (msg: string) => void },
|
|
111
|
-
): Promise<Awaited<ReturnType<typeof client.pages.retrieveMarkdown>>> {
|
|
112
|
-
return withRetry(
|
|
113
|
-
() => client.pages.retrieveMarkdown({ page_id: pageId }),
|
|
114
|
-
`Page ${pageId}`,
|
|
115
|
-
logger,
|
|
116
|
-
);
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
// Define any options that the loader needs
|
|
120
|
-
export function loader({
|
|
121
|
-
queryParameters,
|
|
122
|
-
clientOptions,
|
|
123
|
-
}: LoaderOptions): Loader {
|
|
124
|
-
const client = new Client({ notionVersion: "2026-03-11", ...clientOptions });
|
|
125
|
-
|
|
126
|
-
// Return a loader object
|
|
127
|
-
return {
|
|
128
|
-
name: "notro-loader",
|
|
129
|
-
load: async ({ store, parseData, logger }): Promise<void> => {
|
|
130
|
-
// Load data and update the store.
|
|
131
|
-
// Uses retry logic for transient API errors (rate limit, server errors).
|
|
132
|
-
const pageOrDatabases = await queryDataSourceWithRetry(
|
|
133
|
-
client,
|
|
134
|
-
queryParameters,
|
|
135
|
-
logger,
|
|
136
|
-
);
|
|
137
|
-
|
|
138
|
-
const pages = pageOrDatabases.filter((page) => isFullPage(page));
|
|
139
|
-
|
|
140
|
-
// Build a lookup map for O(1) access when checking store entries
|
|
141
|
-
const pageById = new Map(pages.map((page) => [page.id, page]));
|
|
142
|
-
|
|
143
|
-
// Delete entries that are removed, edited, or contain expired pre-signed URLs
|
|
144
|
-
store.entries().forEach(([id, { digest, data }]) => {
|
|
145
|
-
const page = pageById.get(id);
|
|
146
|
-
const isDeleted = page === undefined;
|
|
147
|
-
const isEdited = page !== undefined && digest !== page.last_edited_time;
|
|
148
|
-
const hasExpiredUrls = hasNotionPresignedUrl(
|
|
149
|
-
data as PageWithMarkdownType,
|
|
150
|
-
);
|
|
151
|
-
if (isDeleted || isEdited || hasExpiredUrls) {
|
|
152
|
-
logger.info(`Deleting page ${id} from store`);
|
|
153
|
-
store.delete(id);
|
|
154
|
-
}
|
|
155
|
-
});
|
|
156
|
-
|
|
157
|
-
// Load new or updated pages, respecting Notion's 3 requests/second rate limit.
|
|
158
|
-
// Pages are processed in batches of 3 with a 1-second pause between batches.
|
|
159
|
-
const pagesToLoad = pages.filter((page) => !store.has(page.id));
|
|
160
|
-
const BATCH_SIZE = 3;
|
|
161
|
-
|
|
162
|
-
for (let i = 0; i < pagesToLoad.length; i += BATCH_SIZE) {
|
|
163
|
-
const batch = pagesToLoad.slice(i, i + BATCH_SIZE);
|
|
164
|
-
|
|
165
|
-
await Promise.all(
|
|
166
|
-
batch.map(async (page) => {
|
|
167
|
-
logger.info(`Loading page ${page.id} into store`);
|
|
168
|
-
|
|
169
|
-
let markdownResponse: Awaited<
|
|
170
|
-
ReturnType<typeof client.pages.retrieveMarkdown>
|
|
171
|
-
>;
|
|
172
|
-
|
|
173
|
-
try {
|
|
174
|
-
markdownResponse = await retrieveMarkdownWithRetry(
|
|
175
|
-
client,
|
|
176
|
-
page.id,
|
|
177
|
-
logger,
|
|
178
|
-
);
|
|
179
|
-
} catch (error) {
|
|
180
|
-
// Skip this page rather than aborting the entire build.
|
|
181
|
-
if (error instanceof APIResponseError) {
|
|
182
|
-
logger.warn(
|
|
183
|
-
`Page ${page.id}: failed to retrieve markdown (${error.code}, status ${error.status}). Skipping.`,
|
|
184
|
-
);
|
|
185
|
-
} else {
|
|
186
|
-
logger.warn(
|
|
187
|
-
`Page ${page.id}: unexpected error while retrieving markdown: ${String(error)}. Skipping.`,
|
|
188
|
-
);
|
|
189
|
-
}
|
|
190
|
-
return;
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
if (markdownResponse.truncated) {
|
|
194
|
-
// The Notion pages.retrieveMarkdown API truncates content at ~20,000 blocks.
|
|
195
|
-
// There is no cursor/pagination parameter to retrieve the remaining content.
|
|
196
|
-
// The page will be loaded with the truncated content only.
|
|
197
|
-
// To avoid truncation, split large Notion pages into smaller sub-pages.
|
|
198
|
-
logger.warn(
|
|
199
|
-
`Page ${page.id}: markdown content was truncated by the Notion API ` +
|
|
200
|
-
`(~20,000 block limit). No pagination is available for this endpoint. ` +
|
|
201
|
-
`Consider splitting this Notion page into smaller pages to avoid truncation.`,
|
|
202
|
-
);
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
if (markdownResponse.unknown_block_ids.length > 0) {
|
|
206
|
-
// unknown_block_ids contains IDs of blocks that could not be converted to
|
|
207
|
-
// Markdown by the Notion API (e.g. unsupported or unrenderable block types).
|
|
208
|
-
// These blocks are silently omitted from the markdown output.
|
|
209
|
-
// There is no way to retrieve their content via this API endpoint.
|
|
210
|
-
logger.warn(
|
|
211
|
-
`Page ${page.id}: ${markdownResponse.unknown_block_ids.length} block(s) could not be rendered to Markdown by the Notion API and were omitted. ` +
|
|
212
|
-
`Block IDs: ${markdownResponse.unknown_block_ids.join(", ")}`,
|
|
213
|
-
);
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
// Store raw markdown from the Notion API.
|
|
217
|
-
// remarkNfm in the MDX compile pipeline (compile-mdx.ts) runs
|
|
218
|
-
// preprocessNotionMarkdown() at parse time, so preprocessing
|
|
219
|
-
// does not need to happen here.
|
|
220
|
-
const rawMarkdown = markdownResponse.markdown;
|
|
221
|
-
|
|
222
|
-
const data = await parseData<PageWithMarkdownType>({
|
|
223
|
-
id: page.id,
|
|
224
|
-
data: {
|
|
225
|
-
parent: page.parent,
|
|
226
|
-
properties: page.properties,
|
|
227
|
-
icon: page.icon,
|
|
228
|
-
cover: page.cover,
|
|
229
|
-
created_by: page.created_by,
|
|
230
|
-
last_edited_by: page.last_edited_by,
|
|
231
|
-
object: page.object,
|
|
232
|
-
id: page.id,
|
|
233
|
-
created_time: page.created_time,
|
|
234
|
-
last_edited_time: page.last_edited_time,
|
|
235
|
-
archived: page.archived,
|
|
236
|
-
in_trash: page.in_trash,
|
|
237
|
-
url: page.url,
|
|
238
|
-
public_url: page.public_url,
|
|
239
|
-
markdown: rawMarkdown,
|
|
240
|
-
truncated: markdownResponse.truncated,
|
|
241
|
-
} as PageWithMarkdownType,
|
|
242
|
-
});
|
|
243
|
-
|
|
244
|
-
store.set({
|
|
245
|
-
id: page.id,
|
|
246
|
-
// digest is used by the loader to detect edits between builds.
|
|
247
|
-
// We use last_edited_time as a stable, string-comparable digest.
|
|
248
|
-
digest: page.last_edited_time,
|
|
249
|
-
data: data,
|
|
250
|
-
// body is the raw text exposed by Astro's Content Layer API for
|
|
251
|
-
// full-text search integrations. It is separate from data.markdown
|
|
252
|
-
// (which is also the raw markdown) because Astro's store.set()
|
|
253
|
-
// requires body to be a top-level field distinct from the schema data.
|
|
254
|
-
body: rawMarkdown,
|
|
255
|
-
});
|
|
256
|
-
}),
|
|
257
|
-
);
|
|
258
|
-
|
|
259
|
-
// Wait 1 second between batches to stay within the 3 req/s rate limit
|
|
260
|
-
if (i + BATCH_SIZE < pagesToLoad.length) {
|
|
261
|
-
await new Promise((resolve) => setTimeout(resolve, 1000));
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
},
|
|
265
|
-
// It will be overridden by user-defined schema.
|
|
266
|
-
schema: pageWithMarkdownSchema,
|
|
267
|
-
};
|
|
268
|
-
}
|
|
1
|
+
import type { Loader } from "astro/loaders";
|
|
2
|
+
import {
|
|
3
|
+
Client,
|
|
4
|
+
isFullPage,
|
|
5
|
+
iteratePaginatedAPI,
|
|
6
|
+
APIErrorCode,
|
|
7
|
+
APIResponseError,
|
|
8
|
+
} from "@notionhq/client";
|
|
9
|
+
import type { QueryDataSourceParameters } from "@notionhq/client";
|
|
10
|
+
import {
|
|
11
|
+
type PageWithMarkdownType,
|
|
12
|
+
pageWithMarkdownSchema,
|
|
13
|
+
} from "./schema.ts";
|
|
14
|
+
import { markdownHasPresignedUrls } from "../utils/notion-url.ts";
|
|
15
|
+
|
|
16
|
+
/** Options accepted by the `loader` factory. */
type LoaderOptions = {
  /** Parameters forwarded to the Notion data source query that selects the pages to load. */
  queryParameters: QueryDataSourceParameters;
  /**
   * Options for the Notion `Client`. The type is derived from the `Client`
   * constructor so that nothing has to be imported from internal SDK paths.
   */
  clientOptions: ConstructorParameters<typeof Client>[0];
};
|
|
21
|
+
|
|
22
|
+
/**
 * Reports whether a cached entry references Notion-hosted files.
 *
 * Notion file-type covers, icons, and inline images use pre-signed S3 URLs
 * that expire after ~1 hour, so an entry containing any of them has to be
 * re-fetched to obtain fresh URLs.
 *
 * @param data - Cached page data, including its markdown.
 * @returns `true` when the cover or the icon is of type "file", or when
 *   `markdownHasPresignedUrls` reports a match for the markdown.
 */
function hasNotionPresignedUrl(data: PageWithMarkdownType): boolean {
  // Short-circuits left to right: the markdown is only scanned when neither
  // the cover nor the icon is a Notion-hosted file.
  return (
    data.cover?.type === "file" ||
    data.icon?.type === "file" ||
    markdownHasPresignedUrls(data.markdown)
  );
}
|
|
29
|
+
|
|
30
|
+
/** Notion API error codes that are safe to retry (rate limiting and server-side errors). */
const RETRYABLE_API_ERROR_CODES: ReadonlySet<string> = new Set<string>([
  APIErrorCode.RateLimited,
  APIErrorCode.InternalServerError,
  APIErrorCode.ServiceUnavailable,
]);

/**
 * Delay in milliseconds before each successive retry (exponential backoff: 1s, 2s, 4s).
 * The number of entries is also the maximum number of retries.
 */
const RETRY_DELAYS_MS: readonly number[] = [1000, 2000, 4000];
|
|
39
|
+
|
|
40
|
+
/**
 * Runs a Notion API call and retries it when it fails with a transient error.
 *
 * - Errors that are an `APIResponseError` whose code is listed in
 *   `RETRYABLE_API_ERROR_CODES` (rate limited, internal server error, service
 *   unavailable) are retried once per entry of `RETRY_DELAYS_MS`, sleeping for
 *   that entry's delay first (1s / 2s / 4s).
 * - Every other error, and the error of the final attempt, is re-thrown unchanged.
 *
 * @param fn - Async function to call (and call again on transient errors).
 * @param label - Human-readable prefix for warning messages (e.g. "Page <id>").
 * @param logger - Logger with a `warn` method; receives one message per retry.
 * @returns The value `fn` resolves with on its first successful attempt.
 * @throws Whatever `fn` rejected with when the error is not retryable or the
 *   retries are exhausted.
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  label: string,
  logger: { warn: (msg: string) => void },
): Promise<T> {
  const maxRetries = RETRY_DELAYS_MS.length;

  // The loop has no exit condition of its own: every iteration either returns
  // the result or throws, and only a scheduled retry reaches the next iteration.
  for (let attempt = 0; ; attempt += 1) {
    try {
      return await fn();
    } catch (error) {
      if (
        !(error instanceof APIResponseError) ||
        !RETRYABLE_API_ERROR_CODES.has(error.code) ||
        attempt === maxRetries
      ) {
        throw error;
      }

      const delayMs = RETRY_DELAYS_MS[attempt];
      logger.warn(
        `${label}: API error "${error.code}" (attempt ${attempt + 1}/${maxRetries + 1}), retrying in ${delayMs}ms...`,
      );
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
}
|
|
82
|
+
|
|
83
|
+
/**
 * Collects every result of a paginated `client.dataSources.query` call, with
 * retry on transient errors.
 *
 * The whole pagination runs inside a single retried operation, so a transient
 * failure on any page of results restarts the collection from the first page.
 * Retry policy is that of `withRetry`: rate-limit and server errors are retried
 * with backoff, all other errors are re-thrown immediately.
 *
 * @param client - Notion client used to run the query.
 * @param queryParameters - Parameters passed to the first page of the query.
 * @param logger - Logger that receives a warning for every retry.
 * @returns All items yielded by the paginated query, in order.
 */
async function queryDataSourceWithRetry(
  client: Client,
  queryParameters: QueryDataSourceParameters,
  logger: { warn: (msg: string) => void },
): Promise<Awaited<ReturnType<typeof iteratePaginatedAPI<typeof client.dataSources.query>>>[]> {
  const collectAllResults = () =>
    Array.fromAsync(iteratePaginatedAPI(client.dataSources.query, queryParameters));

  return withRetry(collectAllResults, "dataSources.query", logger);
}
|
|
100
|
+
|
|
101
|
+
/**
 * Fetches one page's markdown through `client.pages.retrieveMarkdown`, with
 * retry on transient errors.
 *
 * Retry policy is that of `withRetry`: rate-limit and server errors are retried
 * with backoff, all other errors are re-thrown immediately. Retry warnings are
 * prefixed with "Page <pageId>".
 *
 * @param client - Notion client used for the request.
 * @param pageId - ID of the page whose markdown is requested.
 * @param logger - Logger that receives a warning for every retry.
 * @returns The response of `client.pages.retrieveMarkdown` for the page.
 */
async function retrieveMarkdownWithRetry(
  client: Client,
  pageId: string,
  logger: { warn: (msg: string) => void },
): Promise<Awaited<ReturnType<typeof client.pages.retrieveMarkdown>>> {
  const requestMarkdown = () => client.pages.retrieveMarkdown({ page_id: pageId });

  return withRetry(requestMarkdown, `Page ${pageId}`, logger);
}
|
|
118
|
+
|
|
119
|
+
/**
 * Creates the Astro content loader that mirrors the pages of a Notion data
 * source into the content store.
 *
 * Each `load` run:
 *  1. queries the data source (retrying transient API errors) and keeps only
 *     the results accepted by `isFullPage`;
 *  2. removes stored entries whose page is no longer returned, whose digest
 *     differs from the page's `last_edited_time`, or whose cached data
 *     references Notion pre-signed URLs;
 *  3. fetches the markdown of every page that is not in the store, in batches,
 *     and stores the parsed result.
 *
 * A page whose markdown cannot be retrieved is logged and skipped; it does not
 * abort the run.
 *
 * @param options - `queryParameters` is forwarded to the data source query;
 *   `clientOptions` is spread after the default `notionVersion`, so it can
 *   override that default.
 * @returns A loader named "notro-loader" whose default schema is
 *   `pageWithMarkdownSchema`.
 */
export function loader({
  queryParameters,
  clientOptions,
}: LoaderOptions): Loader {
  const client = new Client({ notionVersion: "2026-03-11", ...clientOptions });

  return {
    name: "notro-loader",
    load: async ({ store, parseData, logger }): Promise<void> => {
      // Query results may contain objects other than full pages; keep pages only.
      const queryResults = await queryDataSourceWithRetry(
        client,
        queryParameters,
        logger,
      );
      const pages = queryResults.filter((result) => isFullPage(result));

      // Index the pages by ID for O(1) lookups while checking store entries.
      const pageById = new Map<string, (typeof pages)[number]>();
      for (const page of pages) {
        pageById.set(page.id, page);
      }

      // Drop entries that were removed, edited, or contain pre-signed URLs.
      for (const [id, entry] of store.entries()) {
        const page = pageById.get(id);
        const isDeleted = page === undefined;
        const isEdited =
          page !== undefined && entry.digest !== page.last_edited_time;
        const hasExpiredUrls = hasNotionPresignedUrl(
          entry.data as PageWithMarkdownType,
        );

        if (isDeleted || isEdited || hasExpiredUrls) {
          logger.info(`Deleting page ${id} from store`);
          store.delete(id);
        }
      }

      // Fetches, validates and stores a single page; logs and returns early
      // when its markdown cannot be retrieved.
      const loadPage = async (page: (typeof pages)[number]): Promise<void> => {
        logger.info(`Loading page ${page.id} into store`);

        let markdownResponse: Awaited<
          ReturnType<typeof client.pages.retrieveMarkdown>
        >;

        try {
          markdownResponse = await retrieveMarkdownWithRetry(
            client,
            page.id,
            logger,
          );
        } catch (error) {
          // Skip this page rather than aborting the entire build.
          const message =
            error instanceof APIResponseError
              ? `Page ${page.id}: failed to retrieve markdown (${error.code}, status ${error.status}). Skipping.`
              : `Page ${page.id}: unexpected error while retrieving markdown: ${String(error)}. Skipping.`;
          logger.warn(message);
          return;
        }

        const {
          markdown: rawMarkdown,
          truncated,
          unknown_block_ids: unknownBlockIds,
        } = markdownResponse;

        if (truncated) {
          // The endpoint offers no cursor/pagination parameter for the rest of
          // the content, so the page is stored with the truncated markdown.
          logger.warn(
            `Page ${page.id}: markdown content was truncated by the Notion API ` +
              `(~20,000 block limit). No pagination is available for this endpoint. ` +
              `Consider splitting this Notion page into smaller pages to avoid truncation.`,
          );
        }

        if (unknownBlockIds.length > 0) {
          // These are blocks the Notion API could not convert to Markdown; they
          // are missing from the markdown output, so at least report them.
          logger.warn(
            `Page ${page.id}: ${unknownBlockIds.length} block(s) could not be rendered to Markdown by the Notion API and were omitted. ` +
              `Block IDs: ${unknownBlockIds.join(", ")}`,
          );
        }

        const {
          parent,
          properties,
          icon,
          cover,
          created_by,
          last_edited_by,
          object,
          id,
          created_time,
          last_edited_time,
          archived,
          in_trash,
          url,
          public_url,
        } = page;

        // The markdown is stored unprocessed; Notion-markdown preprocessing is
        // left to the MDX compile pipeline (see compile-mdx.ts).
        const data = await parseData<PageWithMarkdownType>({
          id,
          data: {
            parent,
            properties,
            icon,
            cover,
            created_by,
            last_edited_by,
            object,
            id,
            created_time,
            last_edited_time,
            archived,
            in_trash,
            url,
            public_url,
            markdown: rawMarkdown,
            truncated,
          } as PageWithMarkdownType,
        });

        store.set({
          id,
          // The digest is compared with last_edited_time on the next run to
          // detect pages edited between builds.
          digest: last_edited_time,
          data,
          // body holds the same raw markdown as data.markdown; store.set()
          // takes it as a top-level field separate from the schema data.
          body: rawMarkdown,
        });
      };

      // Load new or updated pages, respecting Notion's 3 requests/second rate
      // limit: batches of 3 with a 1-second pause between batches.
      const pagesToLoad = pages.filter((page) => !store.has(page.id));
      const BATCH_SIZE = 3;
      const BATCH_PAUSE_MS = 1000;

      for (let start = 0; start < pagesToLoad.length; start += BATCH_SIZE) {
        const end = start + BATCH_SIZE;
        const batch = pagesToLoad.slice(start, end);

        await Promise.all(batch.map((page) => loadPage(page)));

        // No pause is needed after the final batch.
        const hasMoreBatches = end < pagesToLoad.length;
        if (hasMoreBatches) {
          await new Promise((resolve) => setTimeout(resolve, BATCH_PAUSE_MS));
        }
      }
    },
    // It will be overridden by user-defined schema.
    schema: pageWithMarkdownSchema,
  };
}
|