folderblog 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-24MKFHML.cjs → chunk-2TZSVPNP.cjs} +5 -0
- package/dist/{chunk-HMQIQUPB.cjs → chunk-6TFXNIO6.cjs} +108 -0
- package/dist/{chunk-ZRUBI3GH.js → chunk-B43UAOPC.js} +106 -1
- package/dist/{chunk-XP5J4LFJ.js → chunk-D26H5722.js} +5 -0
- package/dist/chunk-E7PYGJA7.cjs +39 -0
- package/dist/{chunk-QA4KPPTA.cjs → chunk-J3Y3HEBF.cjs} +84 -13
- package/dist/{chunk-PARGDJNY.js → chunk-K76XLEC7.js} +1 -1
- package/dist/{chunk-IXP35S24.js → chunk-LPPBVXJ7.js} +83 -12
- package/dist/chunk-Q6EXKX6K.js +17 -0
- package/dist/{chunk-4ZJGUMHS.cjs → chunk-Q6EYTOTM.cjs} +2 -2
- package/dist/chunk-UCXXH2MP.cjs +20 -0
- package/dist/chunk-XQD3UUL5.js +34 -0
- package/dist/cli/bin.cjs +5 -5
- package/dist/cli/bin.js +4 -4
- package/dist/cli/index.cjs +5 -5
- package/dist/cli/index.js +4 -4
- package/dist/config-ADPY6IQS.d.cts +473 -0
- package/dist/config-Dctsdeo6.d.ts +473 -0
- package/dist/index.cjs +157 -187
- package/dist/index.d.cts +4 -3
- package/dist/index.d.ts +4 -3
- package/dist/index.js +16 -69
- package/dist/local/index.cjs +785 -0
- package/dist/local/index.d.cts +268 -0
- package/dist/local/index.d.ts +268 -0
- package/dist/local/index.js +772 -0
- package/dist/output-0P0br3Jc.d.cts +452 -0
- package/dist/output-0P0br3Jc.d.ts +452 -0
- package/dist/plugins/embed-cloudflare-ai.cjs +166 -0
- package/dist/plugins/embed-cloudflare-ai.d.cts +73 -0
- package/dist/plugins/embed-cloudflare-ai.d.ts +73 -0
- package/dist/plugins/embed-cloudflare-ai.js +156 -0
- package/dist/plugins/embed-transformers.cjs +121 -0
- package/dist/plugins/embed-transformers.d.cts +55 -0
- package/dist/plugins/embed-transformers.d.ts +55 -0
- package/dist/plugins/embed-transformers.js +113 -0
- package/dist/plugins/similarity.cjs +19 -0
- package/dist/plugins/similarity.d.cts +41 -0
- package/dist/plugins/similarity.d.ts +41 -0
- package/dist/plugins/similarity.js +2 -0
- package/dist/processor/index.cjs +123 -111
- package/dist/processor/index.d.cts +6 -2
- package/dist/processor/index.d.ts +6 -2
- package/dist/processor/index.js +3 -3
- package/dist/processor/plugins.cjs +24 -12
- package/dist/processor/plugins.d.cts +4 -2
- package/dist/processor/plugins.d.ts +4 -2
- package/dist/processor/plugins.js +1 -1
- package/dist/processor/types.cjs +16 -16
- package/dist/processor/types.d.cts +3 -2
- package/dist/processor/types.d.ts +3 -2
- package/dist/processor/types.js +1 -1
- package/dist/seo/index.cjs +289 -0
- package/dist/seo/index.d.cts +95 -0
- package/dist/seo/index.d.ts +95 -0
- package/dist/seo/index.js +274 -0
- package/dist/server/index.cjs +2 -5
- package/dist/server/index.js +2 -5
- package/package.json +36 -1
- package/dist/config-DFr-htlO.d.cts +0 -887
- package/dist/config-DFr-htlO.d.ts +0 -887
|
@@ -374,14 +374,122 @@ var createAllNoOpPlugins = () => ({
|
|
|
374
374
|
database: new NoOpDatabase()
|
|
375
375
|
});
|
|
376
376
|
|
|
377
|
+
// ../processor/src/plugins/similarity.ts
|
|
378
|
+
/**
 * Similarity plugin that scores posts against each other using the cosine
 * similarity of their embedding vectors and keeps the best matches per post.
 */
var CosineSimilarityPlugin = class {
  name = "similarity";
  // Presumably consumed by the plugin manager to order initialization — confirm against PluginManager.
  requires = ["textEmbedder"];
  ready = false;
  context = null;
  topN;
  threshold;

  /**
   * @param {{ topN?: number, threshold?: number }} [options]
   *   `topN` caps how many similar posts are kept per post (default 5);
   *   `threshold` is the minimum score a pair needs to be listed (default 0).
   */
  constructor(options = {}) {
    const { topN, threshold } = options;
    this.topN = topN ?? 5;
    this.threshold = threshold ?? 0;
  }

  /**
   * Stores the context, marks the plugin ready and logs the active settings.
   * @param {{ log: (message: string, level: string) => void }} context
   */
  async initialize(context) {
    this.context = context;
    this.ready = true;
    const summary = `CosineSimilarityPlugin initialized (topN=${this.topN}, threshold=${this.threshold})`;
    context.log(summary, "info");
  }

  /** @returns {boolean} true between initialize() and dispose(). */
  isReady() {
    return this.ready;
  }

  /** Marks the plugin as no longer ready; the stored context is kept. */
  async dispose() {
    this.ready = false;
  }

  /** Scores a single pair of vectors; delegates to cosineSimilarity. */
  computeSimilarity(a, b) {
    return cosineSimilarity(a, b);
  }

  /**
   * Scores every pair of posts that carry a non-empty embedding.
   *
   * @param {Array<{ hash: string, embedding?: ArrayLike<number> }>} posts
   * @returns {Promise<{
   *   pairwiseScores: Map<string, number>,
   *   similarPosts: Map<string, string[]>,
   *   metadata: { computedAt: string, postCount: number, pairCount: number }
   * }>} `pairwiseScores` is keyed by the two hashes in ascending order joined
   *   with "-"; `similarPosts` maps a post hash to at most `topN` other hashes,
   *   best score first.
   */
  async generateSimilarityMap(posts) {
    const embedded = posts.filter((entry) => entry.embedding && entry.embedding.length > 0);
    const pairwiseScores = /* @__PURE__ */ new Map();
    const similarPosts = /* @__PURE__ */ new Map();
    // Every distinct pair key is stored exactly once, so the map size is the pair count.
    const buildResult = () => ({
      pairwiseScores,
      similarPosts,
      metadata: {
        computedAt: (/* @__PURE__ */ new Date()).toISOString(),
        postCount: embedded.length,
        pairCount: pairwiseScores.size
      }
    });

    if (embedded.length < 2) {
      this.context?.log(
        `Skipping similarity: only ${embedded.length} posts with embeddings`,
        "debug"
      );
      return buildResult();
    }

    for (const post of embedded) {
      const candidates = [];
      for (const other of embedded) {
        if (other.hash === post.hash) continue;
        // Order the two hashes so (a, b) and (b, a) share one cache entry.
        const [first, second] = post.hash < other.hash ? [post.hash, other.hash] : [other.hash, post.hash];
        const pairKey = `${first}-${second}`;
        if (!pairwiseScores.has(pairKey)) {
          pairwiseScores.set(pairKey, cosineSimilarity(post.embedding, other.embedding));
        }
        const score = pairwiseScores.get(pairKey);
        if (score >= this.threshold) {
          candidates.push({ hash: other.hash, score });
        }
      }
      candidates.sort((left, right) => right.score - left.score);
      similarPosts.set(
        post.hash,
        candidates.slice(0, this.topN).map(({ hash }) => hash)
      );
    }

    const pairCount = pairwiseScores.size;
    this.context?.log(
      `Computed similarity: ${pairCount} pairs for ${embedded.length} posts`,
      "info"
    );
    return buildResult();
  }
};
|
|
464
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Missing components count as 0, so vectors of different lengths are compared
 * as if the shorter one were zero-padded. The result is therefore the same
 * whichever argument is the longer one.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  // Walk the longer vector: stopping at a.length would leave b's extra
  // components out of its norm and make the score depend on argument order.
  const length = Math.max(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < length; i++) {
    const aVal = a[i] ?? 0;
    const bVal = b[i] ?? 0;
    dot += aVal * bVal;
    normA += aVal * aVal;
    normB += bVal * bVal;
  }
  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero vector has no direction; report "no similarity" instead of NaN.
  return magnitude === 0 ? 0 : dot / magnitude;
}
|
|
478
|
+
/**
 * Factory for CosineSimilarityPlugin.
 * @param {{ topN?: number, threshold?: number }} [options] Passed to the constructor unchanged.
 * @returns {CosineSimilarityPlugin}
 */
var createSimilarityPlugin = (options) => new CosineSimilarityPlugin(options);
|
|
481
|
+
|
|
377
482
|
exports.CopyOnlyImageProcessor = CopyOnlyImageProcessor;
|
|
483
|
+
exports.CosineSimilarityPlugin = CosineSimilarityPlugin;
|
|
378
484
|
exports.NoOpDatabase = NoOpDatabase;
|
|
379
485
|
exports.NoOpImageEmbedder = NoOpImageEmbedder;
|
|
380
486
|
exports.NoOpSimilarity = NoOpSimilarity;
|
|
381
487
|
exports.NoOpTextEmbedder = NoOpTextEmbedder;
|
|
382
488
|
exports.PassthroughMermaidRenderer = PassthroughMermaidRenderer;
|
|
383
489
|
exports.PluginManager = PluginManager;
|
|
490
|
+
exports.cosineSimilarity = cosineSimilarity;
|
|
384
491
|
exports.createAllNoOpPlugins = createAllNoOpPlugins;
|
|
385
492
|
exports.createDefaultPlugins = createDefaultPlugins;
|
|
386
493
|
exports.createPluginManager = createPluginManager;
|
|
494
|
+
exports.createSimilarityPlugin = createSimilarityPlugin;
|
|
387
495
|
exports.topologicalSort = topologicalSort;
|
|
@@ -367,4 +367,109 @@ var createAllNoOpPlugins = () => ({
|
|
|
367
367
|
database: new NoOpDatabase()
|
|
368
368
|
});
|
|
369
369
|
|
|
370
|
-
|
|
370
|
+
// ../processor/src/plugins/similarity.ts
|
|
371
|
+
/**
 * Similarity plugin that scores posts against each other using the cosine
 * similarity of their embedding vectors and keeps the best matches per post.
 */
var CosineSimilarityPlugin = class {
  name = "similarity";
  // Presumably consumed by the plugin manager to order initialization — confirm against PluginManager.
  requires = ["textEmbedder"];
  ready = false;
  context = null;
  topN;
  threshold;

  /**
   * @param {{ topN?: number, threshold?: number }} [options]
   *   `topN` caps how many similar posts are kept per post (default 5);
   *   `threshold` is the minimum score a pair needs to be listed (default 0).
   */
  constructor(options = {}) {
    const { topN, threshold } = options;
    this.topN = topN ?? 5;
    this.threshold = threshold ?? 0;
  }

  /**
   * Stores the context, marks the plugin ready and logs the active settings.
   * @param {{ log: (message: string, level: string) => void }} context
   */
  async initialize(context) {
    this.context = context;
    this.ready = true;
    const summary = `CosineSimilarityPlugin initialized (topN=${this.topN}, threshold=${this.threshold})`;
    context.log(summary, "info");
  }

  /** @returns {boolean} true between initialize() and dispose(). */
  isReady() {
    return this.ready;
  }

  /** Marks the plugin as no longer ready; the stored context is kept. */
  async dispose() {
    this.ready = false;
  }

  /** Scores a single pair of vectors; delegates to cosineSimilarity. */
  computeSimilarity(a, b) {
    return cosineSimilarity(a, b);
  }

  /**
   * Scores every pair of posts that carry a non-empty embedding.
   *
   * @param {Array<{ hash: string, embedding?: ArrayLike<number> }>} posts
   * @returns {Promise<{
   *   pairwiseScores: Map<string, number>,
   *   similarPosts: Map<string, string[]>,
   *   metadata: { computedAt: string, postCount: number, pairCount: number }
   * }>} `pairwiseScores` is keyed by the two hashes in ascending order joined
   *   with "-"; `similarPosts` maps a post hash to at most `topN` other hashes,
   *   best score first.
   */
  async generateSimilarityMap(posts) {
    const embedded = posts.filter((entry) => entry.embedding && entry.embedding.length > 0);
    const pairwiseScores = /* @__PURE__ */ new Map();
    const similarPosts = /* @__PURE__ */ new Map();
    // Every distinct pair key is stored exactly once, so the map size is the pair count.
    const buildResult = () => ({
      pairwiseScores,
      similarPosts,
      metadata: {
        computedAt: (/* @__PURE__ */ new Date()).toISOString(),
        postCount: embedded.length,
        pairCount: pairwiseScores.size
      }
    });

    if (embedded.length < 2) {
      this.context?.log(
        `Skipping similarity: only ${embedded.length} posts with embeddings`,
        "debug"
      );
      return buildResult();
    }

    for (const post of embedded) {
      const candidates = [];
      for (const other of embedded) {
        if (other.hash === post.hash) continue;
        // Order the two hashes so (a, b) and (b, a) share one cache entry.
        const [first, second] = post.hash < other.hash ? [post.hash, other.hash] : [other.hash, post.hash];
        const pairKey = `${first}-${second}`;
        if (!pairwiseScores.has(pairKey)) {
          pairwiseScores.set(pairKey, cosineSimilarity(post.embedding, other.embedding));
        }
        const score = pairwiseScores.get(pairKey);
        if (score >= this.threshold) {
          candidates.push({ hash: other.hash, score });
        }
      }
      candidates.sort((left, right) => right.score - left.score);
      similarPosts.set(
        post.hash,
        candidates.slice(0, this.topN).map(({ hash }) => hash)
      );
    }

    const pairCount = pairwiseScores.size;
    this.context?.log(
      `Computed similarity: ${pairCount} pairs for ${embedded.length} posts`,
      "info"
    );
    return buildResult();
  }
};
|
|
457
|
+
/**
 * Cosine similarity of two numeric vectors.
 *
 * Missing components count as 0, so vectors of different lengths are compared
 * as if the shorter one were zero-padded. The result is therefore the same
 * whichever argument is the longer one.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  // Walk the longer vector: stopping at a.length would leave b's extra
  // components out of its norm and make the score depend on argument order.
  const length = Math.max(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < length; i++) {
    const aVal = a[i] ?? 0;
    const bVal = b[i] ?? 0;
    dot += aVal * bVal;
    normA += aVal * aVal;
    normB += bVal * bVal;
  }
  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero vector has no direction; report "no similarity" instead of NaN.
  return magnitude === 0 ? 0 : dot / magnitude;
}
|
|
471
|
+
/**
 * Factory for CosineSimilarityPlugin.
 * @param {{ topN?: number, threshold?: number }} [options] Passed to the constructor unchanged.
 * @returns {CosineSimilarityPlugin}
 */
var createSimilarityPlugin = (options) => new CosineSimilarityPlugin(options);
|
|
474
|
+
|
|
475
|
+
export { CopyOnlyImageProcessor, CosineSimilarityPlugin, NoOpDatabase, NoOpImageEmbedder, NoOpSimilarity, NoOpTextEmbedder, PassthroughMermaidRenderer, PluginManager, cosineSimilarity, createAllNoOpPlugins, createDefaultPlugins, createPluginManager, createSimilarityPlugin, topologicalSort };
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// src/errors.ts
|
|
4
|
+
/**
 * Base error for the package; carries the optional HTTP status and request URL.
 */
var FolderBlogError = class extends Error {
  /** HTTP status code if applicable */
  status;
  /** The URL that was requested */
  url;

  /**
   * @param {string} message
   * @param {{ status?: number, url?: string }} [options]
   */
  constructor(message, options) {
    super(message);
    this.name = "FolderBlogError";
    // A missing options bag leaves both fields undefined.
    const { status, url } = options ?? {};
    this.status = status;
    this.url = url;
  }
};
|
|
16
|
+
/**
 * Error for a missing resource; always reports HTTP status 404.
 */
var NotFoundError = class extends FolderBlogError {
  /**
   * @param {string} resource Kind of thing that was looked up; used as the message prefix.
   * @param {string} identifier The value that could not be found.
   * @param {string} [url] The URL that was requested.
   */
  constructor(resource, identifier, url) {
    const message = `${resource} not found: ${identifier}`;
    super(message, { status: 404, url });
    this.name = "NotFoundError";
  }
};
|
|
22
|
+
/**
 * Error carrying a caller-supplied status and the URL that was requested.
 */
var ApiError = class extends FolderBlogError {
  /**
   * @param {string} message
   * @param {number} status Status code recorded on the error.
   * @param {string} [url] The URL that was requested.
   */
  constructor(message, status, url) {
    const details = { status, url };
    super(message, details);
    this.name = "ApiError";
  }
};
|
|
28
|
+
/**
 * Error with no status or URL; optionally records the underlying cause.
 */
var NetworkError = class extends FolderBlogError {
  /**
   * @param {string} message
   * @param {unknown} [cause] Underlying error; stored on `cause` only when truthy.
   */
  constructor(message, cause) {
    super(message);
    this.name = "NetworkError";
    if (cause) {
      this.cause = cause;
    }
  }
};
|
|
35
|
+
|
|
36
|
+
exports.ApiError = ApiError;
|
|
37
|
+
exports.FolderBlogError = FolderBlogError;
|
|
38
|
+
exports.NetworkError = NetworkError;
|
|
39
|
+
exports.NotFoundError = NotFoundError;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
var
|
|
3
|
+
var chunk2TZSVPNP_cjs = require('./chunk-2TZSVPNP.cjs');
|
|
4
|
+
var chunk6TFXNIO6_cjs = require('./chunk-6TFXNIO6.cjs');
|
|
4
5
|
var chunkOBGZSXTJ_cjs = require('./chunk-OBGZSXTJ.cjs');
|
|
5
6
|
var unified = require('unified');
|
|
6
7
|
var remarkParse = require('remark-parse');
|
|
@@ -1019,7 +1020,7 @@ var Processor = class {
|
|
|
1019
1020
|
this.config = options.config;
|
|
1020
1021
|
this.issues = new IssueCollector();
|
|
1021
1022
|
this.log = options.log ?? createDefaultLogger(this.config.debug?.level ?? 1);
|
|
1022
|
-
this.pluginManager = new
|
|
1023
|
+
this.pluginManager = new chunk6TFXNIO6_cjs.PluginManager({
|
|
1023
1024
|
config: this.config,
|
|
1024
1025
|
outputDir: this.resolveOutputDir(),
|
|
1025
1026
|
issues: this.issues,
|
|
@@ -1074,6 +1075,7 @@ var Processor = class {
|
|
|
1074
1075
|
this.log(`Processing folder: ${inputDir}`, "info");
|
|
1075
1076
|
this.log(`Output directory: ${outputDir}`, "info");
|
|
1076
1077
|
await ensureDir(outputDir);
|
|
1078
|
+
const totalStart = Date.now();
|
|
1077
1079
|
const state = {
|
|
1078
1080
|
inputDir,
|
|
1079
1081
|
outputDir,
|
|
@@ -1089,18 +1091,20 @@ var Processor = class {
|
|
|
1089
1091
|
textEmbeddingCacheMisses: 0,
|
|
1090
1092
|
imageEmbeddingCacheHits: 0,
|
|
1091
1093
|
imageEmbeddingCacheMisses: 0
|
|
1092
|
-
}
|
|
1094
|
+
},
|
|
1095
|
+
report: {},
|
|
1096
|
+
phaseTiming: /* @__PURE__ */ new Map()
|
|
1093
1097
|
};
|
|
1094
1098
|
if (!this.config.media?.skip) {
|
|
1095
|
-
await this.processMedia(state);
|
|
1099
|
+
await this.timed(state, "media", () => this.processMedia(state));
|
|
1096
1100
|
} else {
|
|
1097
1101
|
this.log("Skipping media processing", "info");
|
|
1098
1102
|
}
|
|
1099
|
-
await this.processMarkdownFiles(state);
|
|
1100
|
-
await this.generateEmbeddings(state);
|
|
1101
|
-
await this.generateSimilarity(state);
|
|
1102
|
-
await this.buildDatabase(state);
|
|
1103
|
-
const outputFiles = await this.writeOutput(state);
|
|
1103
|
+
await this.timed(state, "markdown", () => this.processMarkdownFiles(state));
|
|
1104
|
+
await this.timed(state, "embeddings", () => this.generateEmbeddings(state));
|
|
1105
|
+
await this.timed(state, "similarity", () => this.generateSimilarity(state));
|
|
1106
|
+
await this.timed(state, "database", () => this.buildDatabase(state));
|
|
1107
|
+
const outputFiles = await this.timed(state, "output", () => this.writeOutput(state));
|
|
1104
1108
|
const issueReport = this.issues.generateReport();
|
|
1105
1109
|
this.log(this.issues.getSummaryString(), "info");
|
|
1106
1110
|
const hasCacheActivity = state.cacheStats.mediaCacheHits > 0 || state.cacheStats.mediaCacheMisses > 0 || state.cacheStats.textEmbeddingCacheHits > 0 || state.cacheStats.imageEmbeddingCacheHits > 0;
|
|
@@ -1110,16 +1114,39 @@ var Processor = class {
|
|
|
1110
1114
|
"info"
|
|
1111
1115
|
);
|
|
1112
1116
|
}
|
|
1117
|
+
const totalMs = Date.now() - totalStart;
|
|
1118
|
+
const phases = {};
|
|
1119
|
+
for (const [phase, ms] of state.phaseTiming) {
|
|
1120
|
+
phases[phase] = ms;
|
|
1121
|
+
}
|
|
1122
|
+
const report = {
|
|
1123
|
+
...state.report,
|
|
1124
|
+
timing: { totalMs, phases }
|
|
1125
|
+
};
|
|
1113
1126
|
return {
|
|
1114
1127
|
posts: state.posts,
|
|
1115
1128
|
media: state.media,
|
|
1116
1129
|
outputDir,
|
|
1117
1130
|
outputFiles,
|
|
1118
1131
|
issues: issueReport,
|
|
1119
|
-
cacheStats: this.config.cache ? state.cacheStats : void 0
|
|
1132
|
+
cacheStats: this.config.cache ? state.cacheStats : void 0,
|
|
1133
|
+
report
|
|
1120
1134
|
};
|
|
1121
1135
|
}
|
|
1122
1136
|
// --------------------------------------------------------------------------
|
|
1137
|
+
// Phase Timing
|
|
1138
|
+
// --------------------------------------------------------------------------
|
|
1139
|
+
async timed(state, phase, fn) {
|
|
1140
|
+
const start = Date.now();
|
|
1141
|
+
const result = await fn();
|
|
1142
|
+
const elapsed = Date.now() - start;
|
|
1143
|
+
state.phaseTiming.set(phase, elapsed);
|
|
1144
|
+
if (elapsed > 0) {
|
|
1145
|
+
this.log(`Phase "${phase}" completed in ${elapsed}ms`, "debug");
|
|
1146
|
+
}
|
|
1147
|
+
return result;
|
|
1148
|
+
}
|
|
1149
|
+
// --------------------------------------------------------------------------
|
|
1123
1150
|
// Media Processing
|
|
1124
1151
|
// --------------------------------------------------------------------------
|
|
1125
1152
|
async processMedia(state) {
|
|
@@ -1585,6 +1612,12 @@ var Processor = class {
|
|
|
1585
1612
|
} else {
|
|
1586
1613
|
this.log(`All ${cachedCount.hits} text embeddings loaded from cache`, "info");
|
|
1587
1614
|
}
|
|
1615
|
+
const postsWithEmbeddings = state.posts.filter((p) => p.embedding && p.embedding.length > 0);
|
|
1616
|
+
state.report.postEmbeddings = {
|
|
1617
|
+
filesProcessed: postsWithEmbeddings.length,
|
|
1618
|
+
dimensions: textEmbedder.dimensions,
|
|
1619
|
+
model: textEmbedder.model
|
|
1620
|
+
};
|
|
1588
1621
|
} catch (error) {
|
|
1589
1622
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1590
1623
|
this.issues.addEmbeddingError({
|
|
@@ -1627,6 +1660,12 @@ var Processor = class {
|
|
|
1627
1660
|
}
|
|
1628
1661
|
}
|
|
1629
1662
|
this.log(`Image embeddings: ${generatedCount} generated, ${cachedCount} from cache`, "info");
|
|
1663
|
+
const mediaWithEmbeddings = state.media.filter((m) => m.embedding && m.embedding.length > 0);
|
|
1664
|
+
state.report.mediaEmbeddings = {
|
|
1665
|
+
filesProcessed: mediaWithEmbeddings.length,
|
|
1666
|
+
dimensions: imageEmbedder.dimensions,
|
|
1667
|
+
model: imageEmbedder.model
|
|
1668
|
+
};
|
|
1630
1669
|
}
|
|
1631
1670
|
}
|
|
1632
1671
|
// --------------------------------------------------------------------------
|
|
@@ -1641,12 +1680,18 @@ var Processor = class {
|
|
|
1641
1680
|
this.log("Generating similarity data...", "info");
|
|
1642
1681
|
try {
|
|
1643
1682
|
const result = await similarity.generateSimilarityMap(state.posts);
|
|
1644
|
-
const similarityPath = path5__default.default.join(state.outputDir,
|
|
1683
|
+
const similarityPath = path5__default.default.join(state.outputDir, chunk2TZSVPNP_cjs.OUTPUT_FILES.SIMILARITY);
|
|
1645
1684
|
await writeJson(similarityPath, {
|
|
1646
1685
|
pairwiseScores: Object.fromEntries(result.pairwiseScores),
|
|
1647
1686
|
similarPosts: Object.fromEntries(result.similarPosts),
|
|
1648
1687
|
metadata: result.metadata
|
|
1649
1688
|
});
|
|
1689
|
+
const simConfig = this.config.similarity;
|
|
1690
|
+
state.report.similarity = {
|
|
1691
|
+
pairsComputed: result.metadata.pairCount,
|
|
1692
|
+
topN: simConfig?.topN ?? 5,
|
|
1693
|
+
postsWithEmbeddings: result.metadata.postCount
|
|
1694
|
+
};
|
|
1650
1695
|
this.log(
|
|
1651
1696
|
`Generated similarity data: ${result.metadata.pairCount} pairs`,
|
|
1652
1697
|
"info"
|
|
@@ -1697,13 +1742,39 @@ var Processor = class {
|
|
|
1697
1742
|
slugMap[post.slug] = post.hash;
|
|
1698
1743
|
pathMap[post.originalPath] = post.hash;
|
|
1699
1744
|
}
|
|
1700
|
-
|
|
1745
|
+
const writePromises = [
|
|
1701
1746
|
writeJson(postsPath, posts),
|
|
1702
1747
|
writeJson(mediaPath, media),
|
|
1703
1748
|
writeJson(slugMapPath, slugMap),
|
|
1704
1749
|
writeJson(pathMapPath, pathMap),
|
|
1705
1750
|
writeJson(issuesPath, this.issues.generateReport())
|
|
1706
|
-
]
|
|
1751
|
+
];
|
|
1752
|
+
const textEmbeddingMap = {};
|
|
1753
|
+
let hasTextEmbeddings = false;
|
|
1754
|
+
for (const post of posts) {
|
|
1755
|
+
if (post.embedding && post.embedding.length > 0) {
|
|
1756
|
+
textEmbeddingMap[post.hash] = post.embedding;
|
|
1757
|
+
hasTextEmbeddings = true;
|
|
1758
|
+
}
|
|
1759
|
+
}
|
|
1760
|
+
if (hasTextEmbeddings) {
|
|
1761
|
+
const textEmbPath = path5__default.default.join(outputDir, chunk2TZSVPNP_cjs.OUTPUT_FILES.TEXT_EMBEDDINGS);
|
|
1762
|
+
writePromises.push(writeJson(textEmbPath, textEmbeddingMap));
|
|
1763
|
+
}
|
|
1764
|
+
const imageEmbeddingMap = {};
|
|
1765
|
+
let hasImageEmbeddings = false;
|
|
1766
|
+
for (const m of media) {
|
|
1767
|
+
const mediaHash = m.metadata?.hash;
|
|
1768
|
+
if (mediaHash && m.embedding && m.embedding.length > 0) {
|
|
1769
|
+
imageEmbeddingMap[mediaHash] = m.embedding;
|
|
1770
|
+
hasImageEmbeddings = true;
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
if (hasImageEmbeddings) {
|
|
1774
|
+
const imageEmbPath = path5__default.default.join(outputDir, chunk2TZSVPNP_cjs.OUTPUT_FILES.IMAGE_EMBEDDINGS);
|
|
1775
|
+
writePromises.push(writeJson(imageEmbPath, imageEmbeddingMap));
|
|
1776
|
+
}
|
|
1777
|
+
await Promise.all(writePromises);
|
|
1707
1778
|
this.log(`Output written to ${outputDir}`, "info");
|
|
1708
1779
|
return {
|
|
1709
1780
|
posts: postsPath,
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OUTPUT_FILES } from './chunk-D26H5722.js';
|
|
2
|
+
import { PluginManager } from './chunk-B43UAOPC.js';
|
|
2
3
|
import { __require } from './chunk-3RG5ZIWI.js';
|
|
3
4
|
import { unified } from 'unified';
|
|
4
5
|
import remarkParse from 'remark-parse';
|
|
@@ -1057,6 +1058,7 @@ var Processor = class {
|
|
|
1057
1058
|
this.log(`Processing folder: ${inputDir}`, "info");
|
|
1058
1059
|
this.log(`Output directory: ${outputDir}`, "info");
|
|
1059
1060
|
await ensureDir(outputDir);
|
|
1061
|
+
const totalStart = Date.now();
|
|
1060
1062
|
const state = {
|
|
1061
1063
|
inputDir,
|
|
1062
1064
|
outputDir,
|
|
@@ -1072,18 +1074,20 @@ var Processor = class {
|
|
|
1072
1074
|
textEmbeddingCacheMisses: 0,
|
|
1073
1075
|
imageEmbeddingCacheHits: 0,
|
|
1074
1076
|
imageEmbeddingCacheMisses: 0
|
|
1075
|
-
}
|
|
1077
|
+
},
|
|
1078
|
+
report: {},
|
|
1079
|
+
phaseTiming: /* @__PURE__ */ new Map()
|
|
1076
1080
|
};
|
|
1077
1081
|
if (!this.config.media?.skip) {
|
|
1078
|
-
await this.processMedia(state);
|
|
1082
|
+
await this.timed(state, "media", () => this.processMedia(state));
|
|
1079
1083
|
} else {
|
|
1080
1084
|
this.log("Skipping media processing", "info");
|
|
1081
1085
|
}
|
|
1082
|
-
await this.processMarkdownFiles(state);
|
|
1083
|
-
await this.generateEmbeddings(state);
|
|
1084
|
-
await this.generateSimilarity(state);
|
|
1085
|
-
await this.buildDatabase(state);
|
|
1086
|
-
const outputFiles = await this.writeOutput(state);
|
|
1086
|
+
await this.timed(state, "markdown", () => this.processMarkdownFiles(state));
|
|
1087
|
+
await this.timed(state, "embeddings", () => this.generateEmbeddings(state));
|
|
1088
|
+
await this.timed(state, "similarity", () => this.generateSimilarity(state));
|
|
1089
|
+
await this.timed(state, "database", () => this.buildDatabase(state));
|
|
1090
|
+
const outputFiles = await this.timed(state, "output", () => this.writeOutput(state));
|
|
1087
1091
|
const issueReport = this.issues.generateReport();
|
|
1088
1092
|
this.log(this.issues.getSummaryString(), "info");
|
|
1089
1093
|
const hasCacheActivity = state.cacheStats.mediaCacheHits > 0 || state.cacheStats.mediaCacheMisses > 0 || state.cacheStats.textEmbeddingCacheHits > 0 || state.cacheStats.imageEmbeddingCacheHits > 0;
|
|
@@ -1093,16 +1097,39 @@ var Processor = class {
|
|
|
1093
1097
|
"info"
|
|
1094
1098
|
);
|
|
1095
1099
|
}
|
|
1100
|
+
const totalMs = Date.now() - totalStart;
|
|
1101
|
+
const phases = {};
|
|
1102
|
+
for (const [phase, ms] of state.phaseTiming) {
|
|
1103
|
+
phases[phase] = ms;
|
|
1104
|
+
}
|
|
1105
|
+
const report = {
|
|
1106
|
+
...state.report,
|
|
1107
|
+
timing: { totalMs, phases }
|
|
1108
|
+
};
|
|
1096
1109
|
return {
|
|
1097
1110
|
posts: state.posts,
|
|
1098
1111
|
media: state.media,
|
|
1099
1112
|
outputDir,
|
|
1100
1113
|
outputFiles,
|
|
1101
1114
|
issues: issueReport,
|
|
1102
|
-
cacheStats: this.config.cache ? state.cacheStats : void 0
|
|
1115
|
+
cacheStats: this.config.cache ? state.cacheStats : void 0,
|
|
1116
|
+
report
|
|
1103
1117
|
};
|
|
1104
1118
|
}
|
|
1105
1119
|
// --------------------------------------------------------------------------
|
|
1120
|
+
// Phase Timing
|
|
1121
|
+
// --------------------------------------------------------------------------
|
|
1122
|
+
async timed(state, phase, fn) {
|
|
1123
|
+
const start = Date.now();
|
|
1124
|
+
const result = await fn();
|
|
1125
|
+
const elapsed = Date.now() - start;
|
|
1126
|
+
state.phaseTiming.set(phase, elapsed);
|
|
1127
|
+
if (elapsed > 0) {
|
|
1128
|
+
this.log(`Phase "${phase}" completed in ${elapsed}ms`, "debug");
|
|
1129
|
+
}
|
|
1130
|
+
return result;
|
|
1131
|
+
}
|
|
1132
|
+
// --------------------------------------------------------------------------
|
|
1106
1133
|
// Media Processing
|
|
1107
1134
|
// --------------------------------------------------------------------------
|
|
1108
1135
|
async processMedia(state) {
|
|
@@ -1568,6 +1595,12 @@ var Processor = class {
|
|
|
1568
1595
|
} else {
|
|
1569
1596
|
this.log(`All ${cachedCount.hits} text embeddings loaded from cache`, "info");
|
|
1570
1597
|
}
|
|
1598
|
+
const postsWithEmbeddings = state.posts.filter((p) => p.embedding && p.embedding.length > 0);
|
|
1599
|
+
state.report.postEmbeddings = {
|
|
1600
|
+
filesProcessed: postsWithEmbeddings.length,
|
|
1601
|
+
dimensions: textEmbedder.dimensions,
|
|
1602
|
+
model: textEmbedder.model
|
|
1603
|
+
};
|
|
1571
1604
|
} catch (error) {
|
|
1572
1605
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1573
1606
|
this.issues.addEmbeddingError({
|
|
@@ -1610,6 +1643,12 @@ var Processor = class {
|
|
|
1610
1643
|
}
|
|
1611
1644
|
}
|
|
1612
1645
|
this.log(`Image embeddings: ${generatedCount} generated, ${cachedCount} from cache`, "info");
|
|
1646
|
+
const mediaWithEmbeddings = state.media.filter((m) => m.embedding && m.embedding.length > 0);
|
|
1647
|
+
state.report.mediaEmbeddings = {
|
|
1648
|
+
filesProcessed: mediaWithEmbeddings.length,
|
|
1649
|
+
dimensions: imageEmbedder.dimensions,
|
|
1650
|
+
model: imageEmbedder.model
|
|
1651
|
+
};
|
|
1613
1652
|
}
|
|
1614
1653
|
}
|
|
1615
1654
|
// --------------------------------------------------------------------------
|
|
@@ -1624,12 +1663,18 @@ var Processor = class {
|
|
|
1624
1663
|
this.log("Generating similarity data...", "info");
|
|
1625
1664
|
try {
|
|
1626
1665
|
const result = await similarity.generateSimilarityMap(state.posts);
|
|
1627
|
-
const similarityPath = path5.join(state.outputDir,
|
|
1666
|
+
const similarityPath = path5.join(state.outputDir, OUTPUT_FILES.SIMILARITY);
|
|
1628
1667
|
await writeJson(similarityPath, {
|
|
1629
1668
|
pairwiseScores: Object.fromEntries(result.pairwiseScores),
|
|
1630
1669
|
similarPosts: Object.fromEntries(result.similarPosts),
|
|
1631
1670
|
metadata: result.metadata
|
|
1632
1671
|
});
|
|
1672
|
+
const simConfig = this.config.similarity;
|
|
1673
|
+
state.report.similarity = {
|
|
1674
|
+
pairsComputed: result.metadata.pairCount,
|
|
1675
|
+
topN: simConfig?.topN ?? 5,
|
|
1676
|
+
postsWithEmbeddings: result.metadata.postCount
|
|
1677
|
+
};
|
|
1633
1678
|
this.log(
|
|
1634
1679
|
`Generated similarity data: ${result.metadata.pairCount} pairs`,
|
|
1635
1680
|
"info"
|
|
@@ -1680,13 +1725,39 @@ var Processor = class {
|
|
|
1680
1725
|
slugMap[post.slug] = post.hash;
|
|
1681
1726
|
pathMap[post.originalPath] = post.hash;
|
|
1682
1727
|
}
|
|
1683
|
-
|
|
1728
|
+
const writePromises = [
|
|
1684
1729
|
writeJson(postsPath, posts),
|
|
1685
1730
|
writeJson(mediaPath, media),
|
|
1686
1731
|
writeJson(slugMapPath, slugMap),
|
|
1687
1732
|
writeJson(pathMapPath, pathMap),
|
|
1688
1733
|
writeJson(issuesPath, this.issues.generateReport())
|
|
1689
|
-
]
|
|
1734
|
+
];
|
|
1735
|
+
const textEmbeddingMap = {};
|
|
1736
|
+
let hasTextEmbeddings = false;
|
|
1737
|
+
for (const post of posts) {
|
|
1738
|
+
if (post.embedding && post.embedding.length > 0) {
|
|
1739
|
+
textEmbeddingMap[post.hash] = post.embedding;
|
|
1740
|
+
hasTextEmbeddings = true;
|
|
1741
|
+
}
|
|
1742
|
+
}
|
|
1743
|
+
if (hasTextEmbeddings) {
|
|
1744
|
+
const textEmbPath = path5.join(outputDir, OUTPUT_FILES.TEXT_EMBEDDINGS);
|
|
1745
|
+
writePromises.push(writeJson(textEmbPath, textEmbeddingMap));
|
|
1746
|
+
}
|
|
1747
|
+
const imageEmbeddingMap = {};
|
|
1748
|
+
let hasImageEmbeddings = false;
|
|
1749
|
+
for (const m of media) {
|
|
1750
|
+
const mediaHash = m.metadata?.hash;
|
|
1751
|
+
if (mediaHash && m.embedding && m.embedding.length > 0) {
|
|
1752
|
+
imageEmbeddingMap[mediaHash] = m.embedding;
|
|
1753
|
+
hasImageEmbeddings = true;
|
|
1754
|
+
}
|
|
1755
|
+
}
|
|
1756
|
+
if (hasImageEmbeddings) {
|
|
1757
|
+
const imageEmbPath = path5.join(outputDir, OUTPUT_FILES.IMAGE_EMBEDDINGS);
|
|
1758
|
+
writePromises.push(writeJson(imageEmbPath, imageEmbeddingMap));
|
|
1759
|
+
}
|
|
1760
|
+
await Promise.all(writePromises);
|
|
1690
1761
|
this.log(`Output written to ${outputDir}`, "info");
|
|
1691
1762
|
return {
|
|
1692
1763
|
posts: postsPath,
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from 'fs';
|
|
2
|
+
|
|
3
|
+
// src/utils.ts
|
|
4
|
+
/**
 * Reads and parses a JSON file, returning `fallback` when the file is absent.
 *
 * @template T
 * @param {string} path File to read as UTF-8.
 * @param {T} fallback Returned as-is when the file does not exist.
 * @returns {T} The parsed JSON value, or `fallback`.
 * @throws {SyntaxError} When the file exists but does not contain valid JSON.
 */
function loadJsonFile(path, fallback) {
  if (!existsSync(path)) return fallback;
  let text;
  try {
    text = readFileSync(path, "utf-8");
  } catch (error) {
    // The file can disappear between the existence check and the read;
    // treat that the same as "was never there".
    if (error?.code === "ENOENT") return fallback;
    throw error;
  }
  // Decoding as UTF-8 keeps a leading byte-order mark, which JSON.parse rejects.
  if (text.charCodeAt(0) === 0xfeff) {
    text = text.slice(1);
  }
  return JSON.parse(text);
}
|
|
8
|
+
/**
 * Turns a domain or URL into a base URL with a scheme and no trailing slash.
 *
 * Surrounding whitespace and all trailing slashes are removed. An existing
 * `http://` or `https://` prefix (any letter case) is kept unchanged;
 * otherwise `https://` is prepended.
 *
 * @param {string} domain e.g. "example.com", "https://example.com/".
 * @returns {string} e.g. "https://example.com".
 */
function normalizeBaseUrl(domain) {
  const trimmed = domain.trim();
  // Drop every trailing slash, not just one, so "example.com//" normalizes too.
  let end = trimmed.length;
  while (end > 0 && trimmed[end - 1] === "/") {
    end--;
  }
  const url = trimmed.slice(0, end);
  // URL schemes are case-insensitive; "HTTPS://host" must not gain a second scheme.
  const scheme = url.slice(0, 8).toLowerCase();
  const hasScheme = scheme.startsWith("http://") || scheme.startsWith("https://");
  return hasScheme ? url : `https://${url}`;
}
|
|
16
|
+
|
|
17
|
+
export { loadJsonFile, normalizeBaseUrl };
|