@llm-newsletter-kit/core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +73 -0
- package/NOTICE +27 -0
- package/README.md +240 -0
- package/dist/index.cjs +1757 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +828 -0
- package/dist/index.js +1755 -0
- package/dist/index.js.map +1 -0
- package/package.json +111 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1755 @@
|
|
|
1
|
+
import { RunnablePassthrough, RunnableSequence } from '@langchain/core/runnables';
|
|
2
|
+
import { pick, omit } from 'es-toolkit';
|
|
3
|
+
import { generateObject } from 'ai';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import juice from 'juice';
|
|
6
|
+
import DOMPurify from 'dompurify';
|
|
7
|
+
import { JSDOM } from 'jsdom';
|
|
8
|
+
import { marked } from 'marked';
|
|
9
|
+
import { randomUUID } from 'node:crypto';
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Common type aliases.
|
|
13
|
+
*
|
|
14
|
+
* - Provides explicit alias types for date/URL/Markdown/HTML, etc.
|
|
15
|
+
* - All comments are written in English JSDoc style.
|
|
16
|
+
*/
|
|
17
|
+
/**
|
|
18
|
+
* Type for date identifiers.
|
|
19
|
+
*
|
|
20
|
+
* The DateType enum is used to distinguish date-related values.
|
|
21
|
+
* It can be used to differentiate between registered dates and ranges.
|
|
22
|
+
*
|
|
23
|
+
* Enum members:
|
|
24
|
+
* - REGISTERED: indicates a registered date.
|
|
25
|
+
* - DURATION: indicates a duration or time range.
|
|
26
|
+
*
|
|
27
|
+
* @example
|
|
28
|
+
* ```ts
|
|
29
|
+
* const type: DateType = DateType.REGISTERED;
|
|
30
|
+
* ```
|
|
31
|
+
*/
|
|
32
|
+
var DateType;
|
|
33
|
+
(function (DateType) {
|
|
34
|
+
DateType["REGISTERED"] = "registered";
|
|
35
|
+
DateType["DURATION"] = "duration";
|
|
36
|
+
})(DateType || (DateType = {}));
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Executor that provides a standardized start/done/error logging pattern.
|
|
40
|
+
* - Uses the injected logger and taskId to attach common fields to every log.
|
|
41
|
+
* - Pass config.event as a prefix like "crawl.group";
|
|
42
|
+
* ".start"/".done"/".error" are appended automatically.
|
|
43
|
+
*/
|
|
44
|
+
class LoggingExecutor {
|
|
45
|
+
logger;
|
|
46
|
+
taskId;
|
|
47
|
+
constructor(logger, taskId) {
|
|
48
|
+
this.logger = logger;
|
|
49
|
+
this.taskId = taskId;
|
|
50
|
+
}
|
|
51
|
+
async executeWithLogging(config, fn) {
|
|
52
|
+
const level = config.level ?? 'debug';
|
|
53
|
+
const startedAt = Date.now();
|
|
54
|
+
const startMsg = {
|
|
55
|
+
event: `${config.event}.start`,
|
|
56
|
+
level,
|
|
57
|
+
taskId: this.taskId,
|
|
58
|
+
data: config.startFields ?? {},
|
|
59
|
+
};
|
|
60
|
+
this.logger[level](startMsg);
|
|
61
|
+
try {
|
|
62
|
+
const result = await fn();
|
|
63
|
+
const durationMs = Date.now() - startedAt;
|
|
64
|
+
const doneExtra = config.doneFields
|
|
65
|
+
? (config.doneFields(result) ?? {})
|
|
66
|
+
: {};
|
|
67
|
+
const doneMsg = {
|
|
68
|
+
event: `${config.event}.done`,
|
|
69
|
+
level,
|
|
70
|
+
taskId: this.taskId,
|
|
71
|
+
durationMs,
|
|
72
|
+
data: { ...(config.startFields ?? {}), ...doneExtra },
|
|
73
|
+
};
|
|
74
|
+
this.logger[level](doneMsg);
|
|
75
|
+
return result;
|
|
76
|
+
}
|
|
77
|
+
catch (err) {
|
|
78
|
+
const durationMs = Date.now() - startedAt;
|
|
79
|
+
const errorMsg = {
|
|
80
|
+
event: `${config.event}.error`,
|
|
81
|
+
level,
|
|
82
|
+
taskId: this.taskId,
|
|
83
|
+
durationMs,
|
|
84
|
+
data: { ...(config.startFields ?? {}) },
|
|
85
|
+
};
|
|
86
|
+
this.logger[level](errorMsg);
|
|
87
|
+
this.logger.error(err);
|
|
88
|
+
throw err;
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
const ensureStringArray = (value) => {
|
|
94
|
+
return typeof value === 'string' ? [value] : value;
|
|
95
|
+
};
|
|
96
|
+
|
|
97
|
+
/**
 * Shared state for LLM-backed queries: the model handle, the expert fields
 * (always stored as an array), logging context and the options bundle.
 */
class BaseLLMQuery {
    model;
    expertFields;
    logger;
    taskId;
    options;
    executeWithLogging;
    /**
     * @param config Must provide `model`, `logger`, `taskId`, `options`
     *   (with `options.content.expertField` as a string or string array) and
     *   a `loggingExecutor` whose `executeWithLogging` method is bound here.
     */
    constructor(config) {
        const { options, loggingExecutor } = config;
        this.model = config.model;
        this.expertFields = ensureStringArray(options.content.expertField);
        this.logger = config.logger;
        this.taskId = config.taskId;
        this.options = options;
        // Bound so the method can be called detached from the executor.
        this.executeWithLogging = loggingExecutor.executeWithLogging.bind(loggingExecutor);
    }
}
|
|
113
|
+
/**
 * An LLM query that operates on one specific article, kept in
 * `targetArticle`.
 */
class LLMQuery extends BaseLLMQuery {
    targetArticle;
    /**
     * @param config Base query configuration plus the `targetArticle` to analyze.
     */
    constructor(config) {
        super(config);
        const { targetArticle } = config;
        this.targetArticle = targetArticle;
    }
}
|
|
120
|
+
|
|
121
|
+
/**
 * LLM query that describes the images embedded (as Markdown image links) in
 * the target article's `detailContent`.
 */
class AnalyzeImages extends LLMQuery {
    schema = z.object({
        imageContext: z
            .string()
            .describe('A comprehensive description of all information extracted from the images'),
    });
    constructor(config) {
        super(config);
    }
    /**
     * Sends the article text plus up to five image references to the model.
     *
     * @returns The generated image description, or `null` when the article is
     *   not flagged as having an attached image, has no `detailContent`, or no
     *   usable image URL could be extracted.
     */
    async execute() {
        if (!this.targetArticle.hasAttachedImage ||
            !this.targetArticle.detailContent) {
            return null;
        }
        // Evaluate the getter once; it re-scans the article on every access.
        const imageMessages = this.imageMessages;
        if (imageMessages.length === 0) {
            return null;
        }
        const { object } = await generateObject({
            model: this.model,
            maxRetries: this.options.llm.maxRetries,
            schema: this.schema,
            system: this.systemPrompt,
            messages: [
                {
                    role: 'user',
                    content: [this.textMessage, ...imageMessages],
                },
            ],
        });
        return object.imageContext;
    }
    /** System prompt describing the analyst role, framework and output rules. */
    get systemPrompt() {
        return `# Image Analysis Expert System

## Identity & Expertise
You are a specialized image analysis expert in: ${this.expertFields.join(', ')}

## Core Responsibilities
1. Extract visual information unavailable from text alone
2. Identify industry-specific elements, facilities, and stakeholders
3. Accurately read and transcribe text, charts, and data visualizations
4. Synthesize visual information with article context

## Analysis Framework

### Information Categories to Extract
- Industry-relevant visual elements
- Text and numerical data within images
- Key subjects (people, places, objects, infrastructure)
- Contextual relationships to ${this.expertFields.join(', ')}
- Information gaps filled by visual analysis

### Quality Standards
- Accuracy and specificity in descriptions
- Professional relevance for industry practitioners
- Integration with accompanying text content
- Completeness in covering all visual information

## Output Specifications
- Language: ${this.options.content.outputLanguage}
- Format: Single cohesive explanation (not numbered list)
- Focus: Practical insights for industry professionals
- Integration: Seamlessly merge all extracted information`;
    }
    /**
     * Image URLs found in the article's Markdown, in document order, capped at
     * five entries.
     *
     * Only destinations starting with `http://`, `https://`, `//`, `/`, `./`,
     * `../` or `data:image/` are kept. An optional quoted Markdown title
     * (`![alt](url "title")`) is removed so that only the URL itself is
     * returned.
     */
    get imageUrls() {
        // Markdown image pattern: ![alt](url) or ![alt](url "title")
        // Includes http, https, relative paths, and data URIs
        const imageRegex = /!\[.*?\]\(([^)]+)\)/g;
        // A trailing "title" or 'title' after the link destination.
        const trailingTitle = /\s+(?:"[^"]*"|'[^']*')$/;
        const acceptedPrefixes = [
            'http://',
            'https://',
            '//', // Protocol-relative URL
            '/', // Absolute path
            './', // Relative path
            '../', // Parent directory relative path
            'data:image/', // Data URI
        ];
        const urls = [];
        let match;
        while ((match = imageRegex.exec(this.targetArticle.detailContent)) !== null) {
            const url = match[1].trim().replace(trailingTitle, '');
            if (url && acceptedPrefixes.some((prefix) => url.startsWith(prefix))) {
                urls.push(url);
            }
        }
        // Process max 5 images only (to save cost)
        return urls.slice(0, 5);
    }
    /** One `{ type: 'image', image: url }` message part per extracted URL. */
    get imageMessages() {
        return this.imageUrls.map((url) => ({
            type: 'image',
            image: url,
        }));
    }
    /** Text message part carrying the article title/content and the task. */
    get textMessage() {
        return {
            type: 'text',
            text: `## Analysis Task

**Document Context:**
- Title: ${this.targetArticle.title}
- Content: ${this.targetArticle.detailContent}

## Instructions

Analyze the provided images and synthesize your findings into a single comprehensive explanation that:

1. **Identifies Visual Content**: Extract industry-specific elements, infrastructure, and stakeholders relevant to ${this.expertFields.join(', ')}

2. **Captures Text & Data**: Accurately read and include all visible text, numerical data, charts, and graphs

3. **Describes Visual Elements**: Detail important subjects (people, places, objects) and their significance

4. **Establishes Connections**: Link visual information to ${this.expertFields.join(', ')} context and article content

5. **Provides Context**: Explain what industry professionals should understand from these images

6. **Complements Text**: Add visual insights not covered in the article text

**Format**: Present all findings as one flowing narrative without enumeration.`,
        };
    }
}
|
|
244
|
+
|
|
245
|
+
/**
 * LLM query that assigns three tags (`tag1`..`tag3`) to the target article,
 * given the list of tags that already exist.
 */
class ClassifyTags extends LLMQuery {
    schema = z.object({
        tag1: z.string(),
        tag2: z.string(),
        tag3: z.string(),
    });
    existTags = [];
    constructor(config) {
        super(config);
    }
    /**
     * @param params `existTags`: tags already known; stored on the instance so
     *   the user prompt can list them.
     * @returns The generated object with `tag1`, `tag2` and `tag3`.
     */
    async execute({ existTags }) {
        this.existTags = existTags;
        const response = await generateObject({
            model: this.model,
            maxRetries: this.options.llm.maxRetries,
            schema: this.schema,
            system: this.systemPrompt,
            prompt: this.userPrompt,
        });
        return response.object;
    }
    /** System prompt: classifier role, output language and tagging rules. */
    get systemPrompt() {
        const fields = this.expertFields.join(', ');
        const language = this.options.content.outputLanguage;
        const quotedFields = this.expertFields.map((v) => `"${v}"`).join(', ');
        return `You are an AI specializing in analyzing and categorizing articles for professionals in ${fields}.

## Core Responsibility
Analyze article titles and content to generate 3 optimal, detailed classifications by evaluating compatibility with existing tags and determining when new tags are justified.

## Output Language
All classifications must be written in ${language}.

## Classification Rules
1. **Reuse Threshold**: Use existing classifications if compatibility is 80% or higher
2. **New Tag Criteria**: Create new classifications only when:
- Best existing match scores below 80% compatibility
- New tag demonstrates versatility across 10+ similar articles
3. **Naming Standards**:
- Length: 3-15 characters
- Style: Clear, intuitive ${language} terms
- Balance industry precision with general reader comprehension
4. **Scope Exclusion**: Avoid broad, general tags like ${quotedFields} (too generic for this expert audience)

## Decision Framework
Prioritize in order:
- Content accuracy and relevance
- Classification system consistency
- User intuitiveness and searchability
- Long-term scalability and maintainability`;
    }
    /** User prompt: the article, the existing tags (as JSON) and the steps. */
    get userPrompt() {
        const { title, detailContent } = this.targetArticle;
        const existingTagsJson = JSON.stringify(this.existTags, null, 2);
        return `**Task**: Classify this article with 3 optimal detailed tags.

**Article Information**
- Title: ${title}
- Content: ${detailContent}

**Available Existing Tags**
\`\`\`
${existingTagsJson}
\`\`\`

**Analysis Steps**
1. Extract core concepts, industry sectors, and specific topics from the article
2. Score each existing tag for compatibility (0-100%)
3. Identify which existing tags meet the 80%+ threshold
4. For tags below 80%, determine if a new tag would better serve the content
5. Validate new tag names for clarity, length, and future applicability

**Output Requirements**
Return exactly 3 classifications following the system rules. Each classification should:
- Match the article's core content
- Fit logically within the overall classification system
- Be immediately understandable to your target audience
- Support future similar articles`;
    }
}
|
|
320
|
+
|
|
321
|
+
/**
 * LLM query that rates the target article's importance on a 1-10 scale,
 * honoring an optional per-URL minimum score.
 */
class DetermineArticleImportance extends LLMQuery {
    minimumImportanceScoreRules;
    schema = z.object({
        importanceScore: z
            .number()
            .min(1)
            .max(10)
            .describe('Article importance score (1-10, 10 is most important)'),
    });
    dateService;
    /**
     * @param config Base query configuration plus `dateService` (used for the
     *   current date in the prompt) and optional
     *   `minimumImportanceScoreRules` (`{ targetUrl, minScore }` entries).
     */
    constructor(config) {
        super(config);
        this.minimumImportanceScoreRules = config.minimumImportanceScoreRules ?? [];
        this.dateService = config.dateService;
    }
    /**
     * Asks the model for an importance score.
     *
     * The schema only guarantees a value in 1-10, so the configured minimum
     * for this article's `targetUrl` is enforced here: a lower model answer is
     * raised to `minPoint`.
     *
     * @returns The importance score, never below `minPoint`.
     */
    async execute() {
        const { object } = await generateObject({
            model: this.model,
            maxRetries: this.options.llm.maxRetries,
            schema: this.schema,
            system: this.systemPrompt,
            prompt: this.userPrompt,
        });
        const floor = Number(this.minPoint);
        // Ignore a non-numeric rule value rather than corrupting the score.
        return Number.isFinite(floor)
            ? Math.max(object.importanceScore, floor)
            : object.importanceScore;
    }
    /** Minimum score from the rule matching the article's `targetUrl`, else 1. */
    get minPoint() {
        const targetRule = this.minimumImportanceScoreRules.find(({ targetUrl }) => targetUrl === this.targetArticle.targetUrl);
        return targetRule?.minScore ?? 1;
    }
    /** True when a rule raises the minimum above the default of 1. */
    get hasHigherMinimumScore() {
        return this.minPoint > 1;
    }
    /**
     * System prompt with the scoring rubric. The "1:" rubric line and the
     * academic-achievement caveat are omitted when a higher minimum applies.
     */
    get systemPrompt() {
        return `You are an expert in importance evaluation in the field of ${this.expertFields.join(', ')}.

Role:
- Analyze titles and content in depth to objectively evaluate the importance of news and announcements.
- Extract the most important insights for industry professionals. Main readers are practitioners from research institutions, local/public officials, graduate students, and field experts in ${this.expertFields.join(', ')}.
- Score based on urgency, impact, and scarcity of information.

Importance Score Criteria (${this.minPoint}-10):
10: Information with immediate and significant impact on entire industry (e.g., major legislation passed, large budget allocation, critical discoveries/events that transform the field)
8-9: Information with important impact on many stakeholders (e.g., major policy changes, major findings/achievements released, large project announcements)
7-8: Very important academic/professional achievements or information in specific fields (e.g., journal publication/release, major research results announcement, professional report publication, important academic events, research database construction/release, designation of important field resources/assets, medium-scale bid information)
5-6: General important information limited to specific fields or regions (e.g., small project permits, general event notices, small-scale bids)
4-5: General industry news or small/medium-scale event information
2-3: Simple information sharing or repetitive daily news
${this.hasHigherMinimumScore ? '' : `1: Information without current significance - Expired support programs, past events, invalid bid notices or recruitment information, notices that have lost practical value, or administrative/simple notices like "membership fee status", "meeting minutes", "internal schedule notices"`}

Evaluation Criteria:
- Academic Value: Journal publications, research reports, academic seminars/symposiums, research output presentations etc. minimum 7 points (knowledge base expansion and long-term reference value)
- Practical Impact: Information requiring immediate response like policies, regulations, bids, recruitment
- Impact Range: How many stakeholders are affected
- Scarcity: How rare and exclusive the information is
- Temporal Context: Practical value at current time considering deadlines, event schedules${this.hasHigherMinimumScore ? '' : ' (However, recent academic achievements maintain high scores)'}

Important Notes:
- Evaluate considering characteristics and context of ${this.expertFields.join(', ')} fields.
- Be sensitive to core keywords, events, policies considered important in the field.`;
    }
    /**
     * User prompt with the current date, article title/content/tags and, when
     * present, the LLM-generated image analysis.
     */
    get userPrompt() {
        return `Please rate the importance of this article from ${this.minPoint} to 10.

**Current Date:** ${this.dateService.getCurrentISODateString()}

**Title:** ${this.targetArticle.title || 'No Title'}

**Content:** ${this.targetArticle.detailContent || 'No Content'}

**Tags:** ${this.targetArticle.tag1 || ''}, ${this.targetArticle.tag2 || ''}, ${this.targetArticle.tag3 || ''}
${this.targetArticle.imageContextByLlm
            ? `
**Image Analysis:** ${this.targetArticle.imageContextByLlm}`
            : ''}`;
    }
}
|
|
397
|
+
|
|
398
|
+
class BaseChain {
|
|
399
|
+
logger;
|
|
400
|
+
taskId;
|
|
401
|
+
provider;
|
|
402
|
+
options;
|
|
403
|
+
executeWithLogging;
|
|
404
|
+
constructor(config) {
|
|
405
|
+
this.logger = config.logger;
|
|
406
|
+
this.taskId = config.taskId;
|
|
407
|
+
this.provider = config.provider;
|
|
408
|
+
this.options = config.options;
|
|
409
|
+
this.executeWithLogging = config.loggingExecutor.executeWithLogging.bind(config.loggingExecutor);
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
/**
 * Adds no members of its own; serves as a distinct base type for chains
 * (AnalysisChain derives from it).
 */
class Chain extends BaseChain {}
|
|
414
|
+
/**
 * Adds no members of its own; serves as a distinct base type for chains
 * (ArticleInsightsChain derives from it).
 */
class PrivateChain extends BaseChain {}
|
|
416
|
+
|
|
417
|
+
/**
 * Chain that enriches the provider's unscored articles with tags, an image
 * description and an importance score, using one LLM query per article and
 * step.
 */
class ArticleInsightsChain extends PrivateChain {
    dateService;
    /**
     * @param config Base chain configuration plus `dateService`, which is
     *   forwarded to the importance query.
     */
    constructor(config) {
        super(config);
        this.dateService = config.dateService;
    }
    /**
     * Pipeline: classify + extract image context, then merge both into the
     * articles, then score importance. The final output is
     * `{ determinedArticles }`.
     */
    /* istanbul ignore next - pipeline arrow functions are exercised via higher-level tests */
    get chain() {
        return RunnablePassthrough.assign({
            generatedTags: () => this.classifyArticles(),
            generatedImageContextList: () => this.extractImageContext(),
        })
            .pipe({
            mergedArticles: ({ generatedTags, generatedImageContextList }) => this.mergeTagsAndImageContext(generatedTags, generatedImageContextList),
        })
            .pipe({
            determinedArticles: ({ mergedArticles }) => this.determineImportance(mergedArticles),
        });
    }
    /**
     * Runs the chain and re-runs it (at most five more times) while any
     * article is still missing a tag or an importance score.
     *
     * @returns The articles with tags, image context and importance score.
     */
    async generateInsights() {
        const { determinedArticles: initial } = await this.chain.invoke({});
        const maxIterations = 5; // Maximum number of iterations to prevent infinite loop
        const reprocess = async (current, iteration) => {
            if (iteration >= maxIterations) {
                this.logger.debug({
                    event: 'insights.warning.maxIterationsReached',
                    taskId: this.taskId,
                    data: {
                        iterationCount: iteration,
                        maxIterationCount: maxIterations,
                    },
                });
                return current;
            }
            // Filter incomplete posts (where any of tag1, tag2, tag3, or importance_score is missing)
            const incompleteArticles = current.filter((article) => !article.tag1 ||
                !article.tag2 ||
                !article.tag3 ||
                !article.importanceScore);
            if (incompleteArticles.length === 0) {
                this.logger.debug({
                    event: 'insights.complete',
                    taskId: this.taskId,
                });
                return current; // Exit when all posts have been fully processed
            }
            this.logger.debug({
                event: 'insights.incomplete.restart',
                taskId: this.taskId,
                data: {
                    incompleteArticleCount: incompleteArticles.length,
                    iterationCount: iteration,
                },
            });
            // NOTE: this re-invokes the whole chain, which iterates
            // provider.unscoredArticles again rather than only the incomplete
            // subset computed above.
            const { determinedArticles: reprocessedArticles } = await this.chain.invoke({});
            // Prefer the freshly processed version of each article when one exists
            const updated = current.map((article) => {
                const reprocessedArticle = reprocessedArticles.find((reArticle) => reArticle.id === article.id);
                return reprocessedArticle || article;
            });
            this.logger.debug({
                event: 'insights.incomplete.restart.done',
                taskId: this.taskId,
                data: {
                    iterationCount: iteration,
                },
            });
            return reprocess(updated, iteration + 1);
        };
        return reprocess(initial, 0);
    }
    /**
     * Generates tags for every provider article that does not already have
     * all three. Newly seen tags are appended to `provider.tags`, so later
     * articles in the same run can reuse them.
     *
     * @returns `{ id, tag1, tag2, tag3 }` entries for the articles that were
     *   classified successfully; failures are logged and skipped.
     */
    async classifyArticles() {
        return this.executeWithLogging({
            event: 'insights.articles.classify',
            level: 'debug',
            // Log how many articles were classified, not the entries themselves.
            doneFields: (articles) => ({ count: articles.length }),
        }, async () => {
            const pushTag = (tag) => {
                if (tag && !this.provider.tags.includes(tag)) {
                    this.provider.tags.push(tag);
                }
            };
            const articlesWithTags = [];
            for (const [i, article] of this.provider.unscoredArticles.entries()) {
                const existTags = this.provider.tags;
                if (article.tag1 && article.tag2 && article.tag3) {
                    continue;
                }
                this.logger.debug({
                    event: 'insights.articles.classify.start',
                    taskId: this.taskId,
                    data: {
                        count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                        articleId: article.id,
                        title: article.title?.substring(0, 50) + '...',
                        existingTags: existTags.length,
                    },
                });
                try {
                    const classifyTags = new ClassifyTags(this.getLlmQueryConfig(this.provider.classifyTagOptions.model, article));
                    const generatedTags = await classifyTags.execute({ existTags });
                    pushTag(generatedTags.tag1);
                    pushTag(generatedTags.tag2);
                    pushTag(generatedTags.tag3);
                    this.logger.debug({
                        event: 'insights.articles.classify.end',
                        taskId: this.taskId,
                        data: {
                            count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                            articleId: article.id,
                            result: `tag1: ${generatedTags.tag1}, tag2: ${generatedTags.tag2}, tag3: ${generatedTags.tag3}`,
                        },
                    });
                    articlesWithTags.push({
                        id: article.id,
                        ...generatedTags,
                    });
                }
                catch (error) {
                    this.logger.debug({
                        event: 'insights.articles.classify.end.error',
                        taskId: this.taskId,
                        data: {
                            count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                            articleId: article.id,
                            error: error instanceof Error ? error.message : String(error),
                            title: article.title?.substring(0, 50) + '...',
                        },
                    });
                    // NOTE: Despite the error, it does not significantly hinder newsletter generation, so we proceed. Tagging helps produce a better newsletter, but it is not strictly required.
                }
            }
            return articlesWithTags;
        });
    }
    /**
     * Runs image analysis for provider articles that have an attached image
     * and no existing `imageContextByLlm`.
     *
     * @returns `{ id, imageContextByLlm }` entries for articles where the
     *   analysis produced text; failures are logged and skipped.
     */
    async extractImageContext() {
        return this.executeWithLogging({
            event: 'insights.images.extract',
            level: 'debug',
            doneFields: (articles) => ({ articles }),
        }, async () => {
            const articlesWithImageContext = [];
            for (const [i, article] of this.provider.unscoredArticles.entries()) {
                if (!article.hasAttachedImage) {
                    this.logger.debug({
                        event: 'insights.images.extract.pass.noimage',
                        taskId: this.taskId,
                        data: {
                            articleId: article.id,
                        },
                    });
                    continue;
                }
                if (article.imageContextByLlm) {
                    this.logger.debug({
                        event: 'insights.images.extract.pass.exist',
                        taskId: this.taskId,
                        data: {
                            articleId: article.id,
                        },
                    });
                    continue;
                }
                this.logger.debug({
                    event: 'insights.images.extract.start',
                    taskId: this.taskId,
                    data: {
                        count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                        articleId: article.id,
                    },
                });
                try {
                    const analyzeImages = new AnalyzeImages(this.getLlmQueryConfig(this.provider.analyzeImagesOptions.model, article));
                    const imageContextByLlm = await analyzeImages.execute();
                    if (imageContextByLlm) {
                        articlesWithImageContext.push({
                            id: article.id,
                            imageContextByLlm,
                        });
                        this.logger.debug({
                            event: 'insights.images.extract.end',
                            taskId: this.taskId,
                            data: {
                                count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                                articleId: article.id,
                            },
                        });
                    }
                    else {
                        this.logger.debug({
                            event: 'insights.images.extract.end.noimage',
                            taskId: this.taskId,
                            data: {
                                count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                                articleId: article.id,
                            },
                        });
                    }
                }
                catch (error) {
                    this.logger.debug({
                        event: 'insights.images.extract.end.error',
                        taskId: this.taskId,
                        data: {
                            count: `${i + 1} / ${this.provider.unscoredArticles.length}`,
                            articleId: article.id,
                            error: error instanceof Error ? error.message : String(error),
                        },
                    });
                    // NOTE: Image analysis failure should not interrupt the overall process
                }
            }
            return articlesWithImageContext;
        });
    }
    /**
     * Produces a copy-on-change list of the provider's articles with the
     * generated tags and image context applied (matched by `id`).
     *
     * @param generatedTags Output of `classifyArticles`.
     * @param generatedImageContextList Output of `extractImageContext`.
     * @returns One article per provider article, in the same order.
     */
    async mergeTagsAndImageContext(generatedTags, generatedImageContextList) {
        return this.executeWithLogging({
            event: 'insights.context.merge',
            level: 'debug',
            startFields: {
                generatedTags,
                generatedImageContextList,
            },
            // Log how many articles were merged, not the articles themselves.
            doneFields: (articles) => ({ count: articles.length }),
        }, async () => {
            return this.provider.unscoredArticles.map((article) => {
                const articleWithTags = generatedTags.find(({ id }) => id === article.id);
                const articleWithImageContext = generatedImageContextList.find(({ id }) => id === article.id);
                if (articleWithTags) {
                    article = {
                        ...article,
                        tag1: articleWithTags.tag1,
                        tag2: articleWithTags.tag2,
                        tag3: articleWithTags.tag3,
                    };
                }
                if (articleWithImageContext) {
                    article = {
                        ...article,
                        imageContextByLlm: articleWithImageContext.imageContextByLlm,
                    };
                }
                return article;
            });
        });
    }
    /**
     * Scores every merged article. A failed query does not abort the run: the
     * article is kept with a fallback `importanceScore` of 1.
     *
     * @param mergedArticles Output of `mergeTagsAndImageContext`.
     * @returns The same articles, each with an `importanceScore`.
     */
    async determineImportance(mergedArticles) {
        return this.executeWithLogging({
            event: 'insights.importance.determine',
            level: 'debug',
            startFields: {
                mergedArticles,
            },
            doneFields: (articles) => ({ articles }),
        }, async () => {
            const determinedArticles = [];
            for (const [i, article] of mergedArticles.entries()) {
                this.logger.debug({
                    event: 'insights.importance.determine.start',
                    taskId: this.taskId,
                    data: {
                        count: `${i + 1} / ${mergedArticles.length}`,
                        articleId: article.id,
                        title: article.title?.substring(0, 50) + '...',
                    },
                });
                try {
                    const determineArticleImportance = new DetermineArticleImportance({
                        ...this.getLlmQueryConfig(this.provider.determineScoreOptions.model, article),
                        minimumImportanceScoreRules: this.provider.determineScoreOptions.minimumImportanceScoreRules,
                        dateService: this.dateService,
                    });
                    const importanceScore = await determineArticleImportance.execute();
                    const processedArticle = {
                        ...article,
                        importanceScore,
                    };
                    // Push result first to avoid losing it if logging fails
                    determinedArticles.push(processedArticle);
                    // Best-effort logging that won't affect the result
                    try {
                        this.logger.debug({
                            event: 'insights.importance.determine.end',
                            taskId: this.taskId,
                            data: {
                                count: `${i + 1} / ${mergedArticles.length}`,
                                articleId: article.id,
                                importanceScore: importanceScore,
                            },
                        });
                    }
                    catch {
                        // ignore logging errors
                    }
                }
                catch (error) {
                    // Log error but ensure we still return a fallback score
                    try {
                        this.logger.debug({
                            event: 'insights.importance.determine.end.error',
                            taskId: this.taskId,
                            data: {
                                count: `${i + 1} / ${mergedArticles.length}`,
                                articleId: article.id,
                                error: error instanceof Error ? error.message : String(error),
                            },
                        });
                    }
                    catch {
                        // ignore logging errors
                    }
                    // NOTE: Importance analysis failure should not stop the pipeline; use a fallback score instead
                    determinedArticles.push({
                        ...article,
                        importanceScore: 1, // Set to minimum importance as a sane default
                    });
                }
            }
            return determinedArticles;
        });
    }
    /**
     * Builds the constructor config shared by all LLM queries.
     *
     * @param model Model handle to use for the query.
     * @param targetArticle Article the query operates on.
     * @returns Config with logger, task id, the `content`/`llm` option groups
     *   and a fresh `LoggingExecutor`.
     */
    getLlmQueryConfig(model, targetArticle) {
        return {
            model,
            logger: this.logger,
            taskId: this.taskId,
            targetArticle: targetArticle,
            options: pick(this.options, ['content', 'llm']),
            loggingExecutor: new LoggingExecutor(this.logger, this.taskId),
        };
    }
}
|
|
750
|
+
|
|
751
|
+
/**
 * Chain that loads unscored articles and tags from the provider, runs them
 * through an `ArticleInsightsChain`, and writes every resulting article back
 * through `provider.update`.
 */
class AnalysisChain extends Chain {
    dateService;
    /**
     * @param config - Chain configuration; `config.dateService` is kept on the
     *   instance and later handed to the `ArticleInsightsChain`.
     */
    constructor(config) {
        super(config);
        const { dateService } = config;
        this.dateService = dateService;
    }
    /**
     * Runnable pipeline with three stages:
     * 1. assign `unscoredArticles` and `tags`,
     * 2. assign `determinedArticles` (result of `analyzeArticles`),
     * 3. assign `processedCount` (result of `updateAnalysisContext`).
     * The whole pipeline is wrapped with `withRetry`, using
     * `options.chain.stopAfterAttempt` as the attempt limit.
     */
    get chain() {
        const loadInputs = RunnablePassthrough.assign({
            unscoredArticles: () => this.fetchUnscoredArticles(),
            tags: () => this.fetchTags(),
        });
        const analyze = RunnablePassthrough.assign({
            determinedArticles: ({ unscoredArticles, tags }) => this.analyzeArticles(unscoredArticles, tags),
        });
        const persist = RunnablePassthrough.assign({
            processedCount: ({ determinedArticles }) => this.updateAnalysisContext(determinedArticles),
        });
        const { stopAfterAttempt } = this.options.chain;
        return loadInputs.pipe(analyze).pipe(persist).withRetry({ stopAfterAttempt });
    }
    /**
     * Fetches the unscored articles from the provider inside `executeWithLogging`
     * (event `analysis.articles.fetch`); the completion log carries the item count.
     */
    async fetchUnscoredArticles() {
        const logSpec = {
            event: 'analysis.articles.fetch',
            level: 'debug',
            doneFields: (items) => ({ count: items.length }),
        };
        return this.executeWithLogging(logSpec, async () => this.provider.fetchUnscoredArticles());
    }
    /**
     * Fetches the tags from the provider inside `executeWithLogging`
     * (event `analysis.tags.fetch`); the completion log carries the item count.
     */
    async fetchTags() {
        const logSpec = {
            event: 'analysis.tags.fetch',
            level: 'debug',
            doneFields: (items) => ({ count: items.length }),
        };
        return this.executeWithLogging(logSpec, async () => this.provider.fetchTags());
    }
    /**
     * Builds an `ArticleInsightsChain` over the given articles and tags and
     * returns the result of its `generateInsights()` call.
     *
     * @param unscoredArticles - Articles to analyze; also logged as a start field.
     * @param tags - Tags handed to the insights chain; also logged as a start field.
     * @returns Whatever `generateInsights()` resolves to (its `length` is logged).
     */
    async analyzeArticles(unscoredArticles, tags) {
        const logSpec = {
            event: 'analysis.articles.analyze',
            level: 'debug',
            startFields: {
                unscoredArticles,
                tags,
            },
            doneFields: (items) => ({ count: items.length }),
        };
        return this.executeWithLogging(logSpec, async () => {
            // The insights chain gets the already-fetched data plus the
            // provider's per-step option objects.
            const insightsProvider = {
                unscoredArticles,
                tags,
                classifyTagOptions: this.provider.classifyTagOptions,
                analyzeImagesOptions: this.provider.analyzeImagesOptions,
                determineScoreOptions: this.provider.determineScoreOptions,
            };
            const articleInsightsChain = new ArticleInsightsChain({
                logger: this.logger,
                taskId: this.taskId,
                provider: insightsProvider,
                options: this.options,
                loggingExecutor: new LoggingExecutor(this.logger, this.taskId),
                dateService: this.dateService,
            });
            return articleInsightsChain.generateInsights();
        });
    }
    /**
     * Calls `provider.update` for each analyzed article, one at a time and in
     * array order.
     *
     * @param determinedArticles - Articles to persist; also logged as a start field.
     * @returns The number of articles in `determinedArticles`.
     */
    async updateAnalysisContext(determinedArticles) {
        const logSpec = {
            event: 'analysis.articles.update',
            level: 'debug',
            startFields: {
                determinedArticles,
            },
            doneFields: (count) => ({ count }),
        };
        return this.executeWithLogging(logSpec, async () => {
            // Sequential on purpose: each update is awaited before the next starts.
            for (const article of determinedArticles) {
                await this.provider.update(article);
            }
            return determinedArticles.length;
        });
    }
}
|
|
832
|
+
|
|
833
|
+
/**
 * Converts markdown to HTML for the newsletter body.
 *
 * Steps, in order: parse with `marked`, sanitize with DOMPurify (backed by a
 * fresh JSDOM window), then apply the three string post-processors:
 * `addTargetBlankToAnchors`, `replaceDelTagsWithTilde`, and
 * `correctUnconvertedBoldSyntax`.
 *
 * @param markdown - Markdown source text.
 * @returns The post-processed HTML string.
 */
function markdownToHtml(markdown) {
    const rawHtml = marked.parse(markdown);
    // DOMPurify needs a DOM window; a new empty JSDOM is created on each call.
    const sanitizer = DOMPurify(new JSDOM('').window);
    const postProcessors = [
        addTargetBlankToAnchors,
        replaceDelTagsWithTilde,
        correctUnconvertedBoldSyntax,
    ];
    return postProcessors.reduce((html, step) => step(html), sanitizer.sanitize(rawHtml));
}
|
|
842
|
+
/**
 * Adds `target="_blank"` to every opening `<a>` tag that does not already
 * declare a `target` attribute.
 *
 * A tag counts as "already having a target" only when `target` appears as an
 * attribute name. Occurrences inside quoted attribute values (for example a
 * `?target=...` query parameter in `href`) or as the suffix of another
 * attribute name (for example `data-target`) do not count. The check is
 * case-insensitive and tolerates whitespace around `=`.
 *
 * Limitation: the tag is matched up to the first `>`, so a literal `>` inside
 * an attribute value ends the match early.
 *
 * @param htmlString - HTML text to process.
 * @returns The HTML with `target="_blank"` appended to anchors lacking a target.
 */
function addTargetBlankToAnchors(htmlString) {
    // Opening anchor tag; group 1 is the raw attribute text (undefined for a bare <a>).
    const anchorOpenTag = /<a(\s+[^>]*?)?>/gi;
    // Quoted attribute values, blanked out so their contents cannot look like attributes.
    const quotedValue = /"[^"]*"|'[^']*'/g;
    // A real `target` attribute: at the start or after whitespace, followed by `=`.
    const targetAttribute = /(?:^|\s)target\s*=/i;
    return htmlString.replace(anchorOpenTag, (_match, attributes) => {
        // Handle undefined attributes as empty string
        const currentAttributes = attributes || '';
        const withoutValues = currentAttributes.replace(quotedValue, ' ');
        if (targetAttribute.test(withoutValues)) {
            // Keep an existing target untouched.
            return `<a${currentAttributes}>`;
        }
        return `<a${currentAttributes} target="_blank">`;
    });
}
|
|
861
|
+
/**
 * Replaces every attribute-less `<del>` and `</del>` tag (any letter case)
 * with a single tilde character.
 *
 * @param htmlString - HTML text to process.
 * @returns The text with those tags replaced by `~`.
 */
function replaceDelTagsWithTilde(htmlString) {
    // One pass covers both the opening and the closing form.
    const delTag = /<\/?del>/gi;
    return htmlString.replace(delTag, '~');
}
|
|
865
|
+
/**
 * Converts leftover `**text**` markdown into `<b>text</b>`.
 *
 * Only runs of one or more non-asterisk characters enclosed by double
 * asterisks are converted; anything containing an inner `*` is left as is.
 *
 * @param htmlString - HTML text that may still contain raw bold markers.
 * @returns The text with matching markers replaced by `<b>` elements.
 */
function correctUnconvertedBoldSyntax(htmlString) {
    const rawBold = /\*\*([^*]+)\*\*/g;
    return htmlString.replace(rawBold, (_whole, inner) => `<b>${inner}</b>`);
}
|
|
870
|
+
|
|
871
|
+
/**
 * LLM query that produces the newsletter title and markdown body.
 *
 * The model is asked for a structured object (see `schema`) that, besides the
 * title and content, carries three self-reported checks: output language,
 * copyright compliance, and fact accuracy. A result is accepted only when all
 * three checks are `true`.
 */
let GenerateNewsletter$1 = class GenerateNewsletter extends BaseLLMQuery {
    maxOutputTokens;
    temperature;
    topP;
    topK;
    presencePenalty;
    frequencyPenalty;
    targetArticles;
    dateService;
    subscribePageUrl;
    newsletterBrandName;
    // Structured output requested from the model; the three booleans are the
    // model's own verification flags checked in `execute`.
    schema = z.object({
        title: z
            .string()
            .max(100)
            .min(20)
            .describe('Title of the newsletter email'),
        content: z.string().describe('Email content in markdown format'),
        isWrittenInOutputLanguage: z
            .boolean()
            .describe(`Whether the content is written in ${this.options.content.outputLanguage}`),
        copyrightVerified: z
            .boolean()
            .describe('Verification status of copyright compliance (true: verified, false: potential violation)'),
        factAccuracy: z
            .boolean()
            .describe('Verification of fact-based content from provided data (true: facts only, false: contains unsupported content)'),
    });
    /**
     * @param config - Query configuration. Sampling settings, target articles,
     *   date service, subscribe URL and brand name are copied onto the
     *   instance; `temperature` falls back to 0.3 when null or undefined.
     */
    constructor(config) {
        super(config);
        const { maxOutputTokens, temperature, topP, topK, presencePenalty, frequencyPenalty, targetArticles, dateService, subscribePageUrl, newsletterBrandName, } = config;
        this.maxOutputTokens = maxOutputTokens;
        this.temperature = temperature ?? 0.3;
        this.topP = topP;
        this.topK = topK;
        this.presencePenalty = presencePenalty;
        this.frequencyPenalty = frequencyPenalty;
        this.targetArticles = targetArticles;
        this.dateService = dateService;
        this.subscribePageUrl = subscribePageUrl;
        this.newsletterBrandName = newsletterBrandName;
    }
    /**
     * Generates the newsletter, regenerating until the model reports that the
     * draft is in the output language, copyright-verified, and fact-accurate.
     *
     * @returns An object holding only the `title` and `content` of the accepted draft.
     */
    async execute() {
        // NOTE(review): there is no cap on regeneration attempts; a model that
        // keeps reporting a failed check makes this loop run indefinitely.
        for (;;) {
            const { object: draft } = await generateObject({
                model: this.model,
                maxRetries: this.options.llm.maxRetries,
                maxOutputTokens: this.maxOutputTokens,
                temperature: this.temperature,
                topP: this.topP,
                topK: this.topK,
                presencePenalty: this.presencePenalty,
                frequencyPenalty: this.frequencyPenalty,
                schema: this.schema,
                system: this.systemPrompt,
                prompt: this.userPrompt,
            });
            const passedSelfChecks = draft.isWrittenInOutputLanguage &&
                draft.copyrightVerified &&
                draft.factAccuracy;
            if (passedSelfChecks) {
                return pick(draft, ['title', 'content']);
            }
        }
    }
    /**
     * System prompt: role, anti-hallucination rules, importance tiers, and the
     * required output format. Interpolates the brand name, expert fields,
     * output language, display date, and (when set) the subscribe page URL.
     */
    get systemPrompt() {
        return `You are a newsletter production expert for "${this.newsletterBrandName}" who analyzes and delivers trends in the fields of ${this.expertFields.join(', ')}. Your goal is to provide in-depth analysis that helps industry professionals easily understand complex information and make informed decisions.

Important rule for displaying date ranges: When displaying date ranges, you must use a hyphen (-) instead of a tilde (~). For example, use 'June 1-2, 2025' instead of 'June 1~2, 2025'. The tilde (~) can be rendered as strikethrough in markdown.

**Key Principles for Preventing Hallucination:**
1. **Fact-Based Writing**: Use only content explicitly stated in the provided sources, do not expand through inference or speculation.
2. **Accurate Citation**: Use expressions directly from the sources without arbitrarily interpreting or elaborating on meanings.
3. **Conservative Approach**: Do not mention uncertain or ambiguous content, or express it very cautiously.
4. **Verifiable Information**: All information must be directly verifiable from the provided sources.
5. **No Speculation**: Do not use speculative expressions like "appears to be" or "is expected to".
6. **No Fictional Standards/Policies**: Do not mention non-existent standards/policies or systems incorrectly reported as planned for future implementation.

Roles:
- Friendly Guide: Deliver information like a trusted colleague rather than a rigid expert. Use appropriate emoticons in titles and section headings to improve readability.
- Information Integrator: Group similar topics or related news to show broader context and trends. Focus on connections between news items rather than individual stories, and explain patterns based on data.
- Credibility Builder: All information must be provided with sources. Whenever specific content or titles are mentioned in the body, links must be provided in [original title](URL) format. Understand that source citation is not just formal but a key element in enhancing newsletter credibility and accessibility.
- Fact Checker: Use only facts from provided source materials. Do not make unsubstantiated claims or speculate beyond the materials.

**Important Prohibitions:**
- Do not bundle or omit structured list items (permits/reports/notices etc.) with "... and n more" etc. (tables must list all items in individual rows).
- Do not describe policies or plans of governments/organizations/companies not explicitly mentioned in sources as facts.
- Do not mention unconfirmed future plans or non-existent standards/policies.
- Do not add details not present in source materials.

**Content Organization Principles:**
- Use only accurate content from provided sources
- No additional details or specific interpretations beyond source materials
- All information must be verifiable and traceable
- Focus on clear facts rather than inference or speculation
- Exclude uncertain content and include only confirmed information
- For tables, include as much identifying information as possible, but mark "—" if not in source
- No arbitrary estimation of domain-specific procedures or schedules (state only confirmed facts)

Importance Score Criteria (1-10 points, expressed as stars):
- ★★★★★ (9-10 points): [Very Important] Laws/regulations that can change industry landscape, large budgets/investments, groundbreaking research/technology announcements that all professionals must know and prepare immediate responses for. These require immediate action and direct changes to business strategy.
- ★★★★☆ (7-8 points): [Important] Major policy changes, large projects/programs, important research/product announcements affecting specific fields or multiple organizations that should be referenced for key decisions. These need action soon and affect mid-term planning.
- ★★★☆☆ (5-6 points): [Reference] Medium-scale projects, services, approvals, major events/campaigns affecting specific regions or organizations. These are changes that professionals in the field should know about.
- ★★☆☆☆ (3-4 points): General industry trends, small events, routine permits/reports that are good to know. No direct action needed but helpful for trend awareness.
- ★☆☆☆☆ (1-2 points): Simple information sharing or repetitive news. Just for reference.

Copyright Protection & Fact-Checking Principles:
- Extract only factual information from sources, completely exclude creative expressions
- When constructing new sentences from extracted facts, do not follow source structure
- Review for remaining source expressions after writing and modify to dry style
- Do not present content not specified in provided materials as fact
- Analysis and insights must be data-based; avoid baseless predictions or claims
- If information is uncertain or requires speculation, clearly use phrases like "is estimated" or "may be possible"

Output Format & Requirements:
1. Language: ${this.options.content.outputLanguage}

2. Start: Specify date (${this.dateService.getDisplayDateString()}) and begin with neutral, objective greeting. Briefly introduce key factual information to be covered in today's newsletter.

3. Overall Briefing: Before the main listing, create a briefing section conveying objective facts about today's news in these aspects:
- Key Trends: Explain major patterns or trends found in this news based on data. Ex: 'Over 00% of today's news relates to 00'.
- Immediate Impact: Emphasize most important changes or decisions affecting industry immediately, specifically mentioning which fields will be most impacted.

4. Category Classification & Content Organization:
- Group news by logical categories based on related tags and content (e.g., Policy/Regulation, Budget/Support, Research/Development, Products/Services, Operations/Process, Recruitment/Events) rather than just listing by importance.
- Use appropriate emoticons for each category for visual distinction.
- Sort by importance within categories, making high-importance items more prominent.
- Add short paragraph at category start summarizing overall trends or changes in that area, specifying important points and areas to focus on.
- Group similar news together for joint analysis when multiple related items exist.
- When content is essentially identical (e.g., same job posting, event notice, announcement) from different sources, integrate around most detailed and accurate information without duplication.
- Use tables when helpful to show commonalities and differences between multiple items at a glance.
- Always provide links in [original title](URL) format whenever article titles or content are mentioned. Do not write as general text like "View" or "Article" or numbered references like [Post3](URL).

5. Detailed Content Writing Guidelines (Importance-Based Length Control):
**ABSOLUTE RULE: The writing length limits below are MAXIMUM constraints. DO NOT EXCEED these limits under any circumstances.**

- **Tier 1 (9-10 points) - Full Detail Allowed:**
- Key Facts: 1-2 sentences in **bold** with source link [original title](URL).
- Related Targets & Scope: Bullet points for different target groups.
- Important Dates & Procedures: Deadlines, methods, required documents by step.
- Related Facts: Budget/scale/participants/scope as factual data.
- Use tables when comparing multiple items.

- **Tier 2 (6-8 points) - ABSOLUTE MAXIMUM 3 SENTENCES:**
- Format: ONE sentence with **bold** key fact + [original title](URL) link. OPTIONALLY add ONE more sentence with critical detail (deadline/budget/target). NEVER EXCEED 3 SENTENCES TOTAL.
- DO NOT write bullet points, DO NOT write multiple paragraphs, DO NOT add subsections.

- **Tier 3 (1-5 points) - ABSOLUTE MAXIMUM 1 SENTENCE:**
- Format: ONE single sentence with core fact + [original title](URL) link. PERIOD. NO ADDITIONAL SENTENCES.
- Multiple low-priority items can be grouped into a single bullet list.

**VIOLATION WARNING: If you write more than the maximum sentences allowed for Tier 2 or Tier 3, the output will be rejected and you must regenerate.**

- **Structured Lists (Permits/Reports/Notices):** Create tables listing every item in individual rows without abbreviation, regardless of importance score.
- Use professional but friendly tone that's easy to understand. (Ex: Use "is notable" instead of "is", "is recommended" instead of "must", "needs to" etc.)
- Can use blockquotes to highlight expert comments or particularly emphasized insights.

6. Closing: Write objective closing including:
- Brief summary of key factual information covered today.
- Objectively list ongoing important schedules or imminent deadlines.
- Maintain neutral and objective tone.
- Do not write preview or anticipatory messages about next newsletter.
- Do not include contact information for inquiries.

7. Title Writing Guidelines:
- Title should objectively convey core facts of 1-2 most important news items today.
- Write with key facts rather than simple "Newsletter", more effective with specific figures or schedules.
- Use neutral and objective terms in title (e.g., 'announced', 'implementing', 'deadline approaching').
- Keep title length 20-50 characters and can include 1-2 relevant emoticons.
- Place most important key facts at beginning of title.
- Write title clearly and factually to maintain professionalism and credibility.

8. Additional Requirements:
- Comprehensively analyze posts to create email containing most important information for ${this.expertFields.join(', ')} field experts.
- Naturally include date at beginning in the format: "${this.dateService.getDisplayDateString()} ${this.expertFields.join(', ')} [News Term]". Replace [News Term] with the word for "News" appropriate for the output language (e.g., "News" for English, "소식" for Korean). Declare this part as \`Heading 1\`(#).
- Write body in markdown format, effectively using headings(#, ##, ###), bold(**), italics(_), bullet points(-, *) etc. to improve readability.
- Group related news to provide broader context, and mention development status if there's continuity with content covered in previous issues.
- **Source citation is most important for ensuring credibility.** Must provide links in [original title](URL) format using source's title. Do not write as "View", "Article", "[Post3](URL)" format.
- Specify source whenever article titles or content are quoted in newsletter, ensure all information is provided with links.
- Discover connections and patterns between news items to provide integrated insights rather than simple listing, and provide data-based insightful analysis.
- Structure entire content so experts can quickly scan and grasp key information, design so busy experts can understand most important content within 2-3 minutes.
- Including simple small data analysis (e.g., "00% of this news budget-related", "30% increase in 00-related news vs last week") adds more valuable insight where possible.
${this.subscribePageUrl ? `- Add \`${this.subscribePageUrl}\`(Subscribe to ${this.newsletterBrandName}) page access link button at appropriate attention-worthy spot for natural recommendation to others.` : ''}`;
    }
    /**
     * User prompt: every target article rendered as a "## Post N" section
     * (title, content, importance, tags, content type, URL, optional image
     * analysis), followed by the generation request and formatting rules.
     */
    get userPrompt() {
        // Renders one article; empty tag slots are dropped before joining.
        const renderPost = (post, index) => `## Post ${index + 1}
**Title:** ${post.title}
**Content:** ${post.detailContent}
**Importance:** ${post.importanceScore}/10
**Tags:** ${[post.tag1, post.tag2, post.tag3].filter(Boolean).join(', ')}
**Content Type:** ${post.contentType}
**URL:** ${post.url}
${post.imageContextByLlm ? `**Image Analysis:** ${post.imageContextByLlm}` : ''}
`;
        return `Below is the complete list of newly collected ${this.expertFields.join(', ')} related news:

${this.targetArticles
            .map((post, index) => renderPost(post, index))
            .join('\n\n')}


---
**Comprehensive Analysis and Daily Newsletter Generation Request:**
Based on all post information provided above, please generate a ${this.expertFields.join(', ')} trends newsletter for ${this.dateService.getDisplayDateString()}. Please note the following:

1. **STRICT LENGTH CONTROL BY IMPORTANCE SCORE:**
- 9-10 points: Full detailed coverage allowed (Key Facts + Targets + Dates + Related Facts)
- 6-8 points: MAXIMUM 3 SENTENCES ONLY. Do not write detailed analysis, subsections, or bullet points.
- 1-5 points: MAXIMUM 1 SENTENCE ONLY. Just core fact + link.

2. Prioritize high importance items (9-10 points) first and structure information by importance and topic.
3. Instead of simply listing news items, group similar topics to strengthen information connectivity. For duplicate content from different sources (e.g., same job posting, same event notice), minimize redundancy by mentioning once or grouping around the most detailed content.
4. Sort by highest importance within categories and analyze category trends and patterns to assess industry-wide impact.
5. Include 1-2 most important news items in the title and use appropriate emoticons to enhance visual readability.
6. Structure the final output so experts can quickly grasp key information and provide insights that aid practical decision-making.
7. Source citation is crucial for credibility. Whenever mentioning any news, always provide links in [original title](URL) format. Do not use formats like "View Details" or "Post3". Content must always be accompanied by source links.
8. Never create content not present in the provided materials. All analysis and insights must be based strictly on the provided post materials, without adding arbitrary information or presenting it as fact.
9. Structured List (Permits/Reports/Notices etc.) Table Formatting Principles (No Omissions/Abbreviations):
- Do not bundle any items in this category; never abbreviate with "... and n more", "others", etc.
- List all items in tables with one item per row. Do not shorten tables regardless of length.
- Each row must include [original title](URL) and when available, list the following fields in order: Organization (or Publisher) | Region/Basin | Number/ID (Permit/Receipt/Report number etc.) | Date (Post/Issue/Permit date as shown in source).
- Mark missing values as "—" rather than leaving blank or omitting columns.
- Maintain table format without using bullet/number lists.

Please follow the roles and output format defined in the system prompt (friendly introduction, overall briefing, category classification, in-depth analysis, polite closing, etc.).`;
    }
};
|
|
1098
|
+
|
|
1099
|
+
/**
 * Chain that turns candidate articles into a stored newsletter issue:
 * fetch candidates, generate title/markdown with the LLM, render the HTML
 * template, and save the result through the provider.
 *
 * Every stage after the fetch tolerates a `null` input, so a skipped issue
 * flows through the pipeline as `null` values and ends with a `null`
 * `newsletterId`.
 */
class ContentGenerateChain extends Chain {
    dateService;
    minimumArticleCountForIssue;
    priorityArticleScoreThreshold;
    htmlTemplate;
    /**
     * @param config - Chain configuration. Reads `dateService` and, from
     *   `config.provider`, the optional `publicationCriteria` (defaults:
     *   `minimumArticleCountForIssue` 5, `priorityArticleScoreThreshold` 8)
     *   and `htmlTemplate` (marker names default to `title` and `content`).
     */
    constructor(config) {
        super(config);
        this.dateService = config.dateService;
        this.minimumArticleCountForIssue =
            config.provider.publicationCriteria?.minimumArticleCountForIssue ?? 5;
        this.priorityArticleScoreThreshold =
            config.provider.publicationCriteria?.priorityArticleScoreThreshold ?? 8;
        this.htmlTemplate = {
            html: config.provider.htmlTemplate.html,
            markers: {
                title: config.provider.htmlTemplate.markers?.title ?? 'title',
                content: config.provider.htmlTemplate.markers?.content ?? 'content',
            },
        };
    }
    /**
     * Runnable pipeline assigning, in order: `candidateArticles`,
     * `generatedCoreContent`, `html`, and `newsletterId`. The whole pipeline
     * is wrapped with `withRetry`, limited by `options.chain.stopAfterAttempt`.
     */
    get chain() {
        return RunnablePassthrough.assign({
            candidateArticles: () => this.fetchArticleCandidates(),
        })
            .pipe(RunnablePassthrough.assign({
            generatedCoreContent: ({ candidateArticles }) => this.generateCoreContent(candidateArticles),
            candidateArticles: ({ candidateArticles }) => candidateArticles,
        }))
            .pipe(RunnablePassthrough.assign({
            html: ({ generatedCoreContent }) => this.renderHtml(generatedCoreContent),
            generatedCoreContent: ({ generatedCoreContent }) => generatedCoreContent,
            candidateArticles: ({ candidateArticles }) => candidateArticles,
        }))
            .pipe(RunnablePassthrough.assign({
            newsletterId: ({ html, generatedCoreContent, candidateArticles }) => this.createNewsletter(html, generatedCoreContent, candidateArticles),
        }))
            .withRetry({ stopAfterAttempt: this.options.chain.stopAfterAttempt });
    }
    /**
     * Fetches candidate articles from the provider inside `executeWithLogging`
     * (event `generate.content.articles.fetch`); the item count is logged on completion.
     */
    async fetchArticleCandidates() {
        return this.executeWithLogging({
            event: 'generate.content.articles.fetch',
            level: 'debug',
            doneFields: (items) => ({ count: items.length }),
        }, async () => {
            return await this.provider.fetchArticleCandidates();
        });
    }
    /**
     * Generates the newsletter title and markdown body for the candidates.
     *
     * @param candidateArticles - Articles considered for this issue.
     * @returns The `{ title, content }` object from the LLM query, or `null`
     *   when there are no candidates, or when the candidate count is at or
     *   below `minimumArticleCountForIssue` and no article reaches
     *   `priorityArticleScoreThreshold`.
     */
    async generateCoreContent(candidateArticles) {
        return this.executeWithLogging({
            event: 'generate.content.core.generate',
            level: 'info',
            startFields: { count: candidateArticles.length },
            doneFields: (result) => ({ title: result?.title }),
        }, async () => {
            if (candidateArticles.length === 0) {
                this.logger.debug({
                    event: 'generate.content.core.generate.noarticle',
                    taskId: this.taskId,
                });
                return null;
            }
            const hasHighImportancePost = candidateArticles.some(({ importanceScore }) => importanceScore >= this.priorityArticleScoreThreshold);
            // NOTE(review): `<=` means exactly `minimumArticleCountForIssue`
            // articles is still treated as too few; confirm this is intended.
            if (candidateArticles.length <= this.minimumArticleCountForIssue &&
                !hasHighImportancePost) {
                this.logger.debug({
                    event: 'generate.content.core.generate.criteria',
                    taskId: this.taskId,
                    data: {
                        count: candidateArticles.length,
                        hasHighImportancePost,
                    },
                });
                return null;
            }
            const generateNewsletter = new GenerateNewsletter$1({
                model: this.provider.model,
                maxOutputTokens: this.provider.maxOutputTokens,
                temperature: this.provider.temperature,
                topP: this.provider.topP,
                topK: this.provider.topK,
                presencePenalty: this.provider.presencePenalty,
                frequencyPenalty: this.provider.frequencyPenalty,
                logger: this.logger,
                taskId: this.taskId,
                targetArticles: candidateArticles,
                options: pick(this.options, ['content', 'llm']),
                loggingExecutor: new LoggingExecutor(this.logger, this.taskId),
                subscribePageUrl: this.provider.subscribePageUrl,
                newsletterBrandName: this.provider.newsletterBrandName,
                dateService: this.dateService,
            });
            return await generateNewsletter.execute();
        });
    }
    /**
     * Fills the HTML template with the generated title and the HTML produced
     * from the generated markdown.
     *
     * @param coreContent - `{ title, content }` from `generateCoreContent`, or `null`.
     * @returns The rendered HTML string, or `null` when `coreContent` is falsy.
     */
    async renderHtml(coreContent) {
        return this.executeWithLogging({
            event: 'generate.content.html.render',
            level: 'debug',
            startFields: { coreContent },
            doneFields: (html) => ({ html }),
        }, async () => {
            if (!coreContent) {
                return null;
            }
            const titleMarker = `{{${this.htmlTemplate.markers.title}}}`;
            const contentMarker = `{{${this.htmlTemplate.markers.content}}}`;
            const contentHtml = markdownToHtml(coreContent.content);
            // Replacer functions insert the text literally. A plain string
            // replacement would interpret `$$`, `$&`, "$`" and `$'` in the
            // generated text as substitution patterns and corrupt the output.
            // NOTE(review): the title is inserted without HTML escaping; only
            // the content passes through the sanitizer in `markdownToHtml`.
            return this.htmlTemplate.html
                .replaceAll(titleMarker, () => coreContent.title)
                .replaceAll(contentMarker, () => contentHtml);
        });
    }
    /**
     * Saves the newsletter through `provider.saveNewsletter`.
     *
     * @param html - Rendered HTML, or `null` when rendering was skipped.
     * @param coreContent - Generated `{ title, content }`, or `null`.
     * @param candidateArticles - Articles stored as `usedArticles`.
     * @returns The `id` returned by the provider, or `null` when either
     *   `html` or `coreContent` is falsy.
     */
    async createNewsletter(html, coreContent, candidateArticles) {
        return this.executeWithLogging({
            event: 'generate.content.newsletter.create',
            level: 'debug',
            startFields: { html, count: candidateArticles.length },
            doneFields: (id) => ({ id }),
        }, async () => {
            if (!html || !coreContent) {
                return null;
            }
            const { id } = await this.provider.saveNewsletter({
                newsletter: {
                    ...coreContent,
                    // `juice` inlines the template's CSS into the markup.
                    htmlBody: juice(html),
                    issueOrder: this.provider.issueOrder,
                    date: this.dateService.getCurrentISODateString(),
                },
                usedArticles: candidateArticles,
            });
            return id;
        });
    }
}
|
|
1231
|
+
|
|
1232
|
+
/**
 * Browser User-Agent strings from which `getRandomUserAgent` picks one.
 * Entries cover Chrome, Edge, Firefox and Safari on Windows, macOS and Linux.
 */
const USER_AGENTS = [
    // Windows: Chrome, Edge, Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0',
    // macOS: Chrome, Safari, Firefox
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0',
    // Linux: Chrome, Firefox
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0',
    // Older Chrome builds on Windows and macOS
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
];
/**
 * Returns one entry of `USER_AGENTS`, chosen uniformly with `Math.random`.
 */
const getRandomUserAgent = () => {
    const index = Math.floor(Math.random() * USER_AGENTS.length);
    return USER_AGENTS[index];
};
|
|
1251
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
const sleep = (ms) => new Promise((done) => {
    setTimeout(done, ms);
});
|
|
1252
|
+
/**
 * Restricts `n` to the range `[min, max]`.
 * The upper bound is applied first, then the lower bound, so `min` wins if
 * the bounds are given in reverse order.
 */
const clamp = (n, min, max) => {
    const cappedAbove = Math.min(max, n);
    return Math.max(min, cappedAbove);
};
|
|
1253
|
+
/**
 * Converts a `Retry-After` header value into a delay in milliseconds.
 *
 * A numeric value is read as seconds; anything else is parsed as a date and
 * converted to the time remaining from now. Delays are limited to the range
 * 0 to 60,000 ms.
 *
 * @param header - Raw header value; may be empty, null or undefined.
 * @returns The delay in milliseconds, or `null` when the header is missing,
 *   unparseable, or a date that is not in the future.
 */
function parseRetryAfter(header) {
    const MAX_DELAY_MS = 60_000;
    // Upper bound first, then lower bound.
    const limitDelay = (ms) => Math.max(0, Math.min(MAX_DELAY_MS, ms));
    if (!header) {
        return null;
    }
    // Form 1: delay in seconds.
    const asSeconds = Number(header);
    if (!Number.isNaN(asSeconds)) {
        return limitDelay(asSeconds * 1000);
    }
    // Form 2: HTTP date; an invalid date yields NaN and falls through to null.
    const msUntilDate = new Date(header).getTime() - Date.now();
    return Number.isFinite(msUntilDate) && msUntilDate > 0 ? limitDelay(msUntilDate) : null;
}
|
|
1267
|
+
/**
 * Decide whether a failed request attempt is worth retrying.
 *
 * Status takes precedence: 429 and 5xx are retryable, every other 4xx is not.
 * Without a decisive status, the failure text is searched (case-insensitively)
 * for "aborted", "timeout", "network" or "fetch".
 *
 * The failure may be an `Error` or a plain string: `AbortController.abort(reason)`
 * makes `fetch` reject with the reason as-is, so a string reason must be
 * classified too instead of being silently treated as fatal.
 *
 * @param status HTTP status code, or null/undefined when no response was received.
 * @param error  Value thrown by the request, or null when judging by status only.
 * @returns true when another attempt should be made.
 */
function shouldRetry(status, error) {
    if (status === 429)
        return true; // Too Many Requests (429)
    if (status && status >= 500)
        return true; // 5xx server error
    if (status && status >= 400 && status < 500)
        return false; // Fatal client error
    // Network error or aborted: extract whatever text the failure carries
    let text = '';
    if (error instanceof Error) {
        text = error.message;
    }
    else if (typeof error === 'string') {
        text = error;
    }
    const msg = text.toLowerCase();
    return ['aborted', 'timeout', 'network', 'fetch'].some((keyword) => msg.includes(keyword));
}
|
|
1286
|
+
/**
 * Download a page and return its body as text, retrying transient failures.
 *
 * - Makes up to 5 attempts. Each attempt has its own timeout (10s base, x1.3 per
 *   attempt, clamped to 5s..30s) that covers both the request and the body read.
 * - Non-OK responses are retried when `shouldRetry` allows it, waiting for
 *   exponential backoff plus jitter, or the `Retry-After` delay when that is longer.
 * - Thrown errors (network failure, timeout) are retried on the same backoff.
 * - A non-HTML content type is only logged; the body is still returned.
 * - After a success a short random pause (250..750ms) is inserted before returning.
 *
 * @param logger  Logger exposing `debug` and `error`.
 * @param url     Address to fetch.
 * @param referer Value sent as the `Referer` header.
 * @returns The response body text.
 * @throws The last error once the failure is not retryable or attempts are exhausted.
 */
async function getHtmlFromUrl(logger, url, referer = 'https://www.google.com/') {
    const maxRetries = 5;
    const baseTimeoutMs = 10_000; // Base 10s, increases per attempt
    let lastError = null;
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        const controller = new AbortController();
        const timeoutMs = clamp(baseTimeoutMs * Math.pow(1.3, attempt - 1), 5_000, 30_000);
        // Abort with an Error (not a bare string): fetch rejects with the abort reason
        // as-is, and only Error instances are recognised as retryable failures.
        const timeout = setTimeout(() => controller.abort(new Error(`timeout after ${timeoutMs}ms`)), timeoutMs);
        try {
            const startedAt = Date.now();
            const response = await fetch(url, {
                // mode: 'cors' // Not applicable in Node, left here for behavioral parity with browsers
                redirect: 'follow',
                // @ts-expect-error Undici/Fetch in Node may allow duplex; safe to ignore
                duplex: 'half',
                signal: controller.signal,
                headers: {
                    'User-Agent': getRandomUserAgent(), // Randomize User-Agent
                    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
                    Referer: referer, // Include previous page information
                    Connection: 'keep-alive',
                    // Compression is handled automatically by undici/node-fetch, no need to set Accept-Encoding explicitly
                },
            });
            // Time until the response headers arrived
            const duration = Date.now() - startedAt;
            const status = response.status;
            if (!response.ok) {
                // The body is not needed: stop the timer and discard the body so the
                // underlying connection is not held by an unread stream.
                clearTimeout(timeout);
                await response.body?.cancel().catch(() => { });
                const retryAfterMs = parseRetryAfter(response.headers.get('retry-after'));
                const canRetry = shouldRetry(status, null);
                logger.debug({
                    event: 'fetch.error',
                    data: { url, status, attempt, canRetry, duration, retryAfterMs },
                });
                if (!canRetry || attempt === maxRetries) {
                    const msg = `Request failed (status=${status}) - ${url}`;
                    lastError = new Error(msg);
                    // Same terminal log as the thrown-error path below
                    logger.error({
                        event: 'fetch.failed',
                        data: { url, attempt, error: lastError.message },
                    });
                    break;
                }
                const backoff = Math.pow(2, attempt - 1) * 1000 + Math.random() * 1000;
                // Honor the server's Retry-After when it asks for a longer wait
                const delay = retryAfterMs != null ? Math.max(retryAfterMs, backoff) : backoff;
                await sleep(delay);
                continue;
            }
            const contentType = response.headers.get('content-type') || '';
            if (!contentType.toLowerCase().includes('text/html')) {
                // If not HTML, log a warning and continue (keep I/O compatibility)
                logger.debug({
                    event: 'fetch.nonHtml',
                    data: { url, contentType, attempt },
                });
            }
            // Keep the timeout armed while the body streams in; clear it only afterwards
            const html = await response.text();
            clearTimeout(timeout);
            logger.debug({
                event: 'fetch.success',
                data: { url, status, attempt, duration, size: html.length },
            });
            // Short randomized sleep after success to reduce server load during crawling
            const sleepTime = Math.random() * 500 + 250;
            await sleep(sleepTime);
            return html;
        }
        catch (error) {
            clearTimeout(timeout);
            const canRetry = shouldRetry(null, error);
            lastError = error instanceof Error ? error : new Error(String(error));
            logger.debug({
                event: 'fetch.catch',
                data: { url, attempt, canRetry, error: lastError.message },
            });
            if (!canRetry || attempt === maxRetries) {
                logger.error({
                    event: 'fetch.failed',
                    data: { url, attempt, error: lastError.message },
                });
                throw lastError;
            }
            // Wait before the next attempt (exponential backoff + jitter)
            const retryDelay = Math.pow(2, attempt - 1) * 1000 + Math.random() * 1000;
            await sleep(retryDelay);
        }
    }
    // Reached when the loop was left via `break` (non-retryable or final non-OK response)
    throw lastError;
}
|
|
1372
|
+
|
|
1373
|
+
/**
 * Crawling stage: for each configured target group, fetch every target's list
 * page, drop items the provider already knows, fetch and parse the remaining
 * detail pages, merge list and detail data, and hand the articles to the
 * provider for saving.
 *
 * Relies on members supplied by the `Chain` base class (`provider`, `options`,
 * `logger`, `taskId`, `executeWithLogging`), which is defined outside this block.
 */
class CrawlingChain extends Chain {
    /**
     * @param config Chain configuration. When `config.provider.maxConcurrency` is
     *               null/undefined it is set to 5 on the provider object itself.
     */
    constructor(config) {
        const { provider } = config;
        provider.maxConcurrency ??= 5;
        super({ ...config, provider });
    }
    /**
     * Runnable that assigns one key per target group (keyed by the group's name);
     * each key is produced by running that group's pipeline.
     */
    get chain() {
        const groupRunners = Object.fromEntries(this.provider.crawlingTargetGroups.map((group) => [
            group.name,
            () => this.executeGroupPipeline(group),
        ]));
        return RunnablePassthrough.assign(groupRunners);
    }
    /**
     * Run the per-target pipeline for every target of `group` as a batch limited
     * by the provider's `maxConcurrency`.
     *
     * @returns Sum of the `count` values produced for the group's targets.
     */
    async executeGroupPipeline(group) {
        // Each step re-emits `target` (and `list` where still needed) so later steps can read it.
        const fetchListStep = RunnablePassthrough.assign({
            listPageHtml: ({ target }) => this.fetchListPageHtml(target),
        });
        const targetPipeline = fetchListStep
            .pipe({
            parsedList: ({ target, listPageHtml }) => this.parseListPageHtml(target, listPageHtml),
            target: ({ target }) => target,
        })
            .pipe({
            list: ({ target, parsedList }) => this.dedupeListItems(target, parsedList),
            target: ({ target }) => target,
        })
            .pipe({
            detailPagesHtmlWithPipelineId: ({ target, list }) => this.fetchDetailPagesHtml(target, list),
            target: ({ target }) => target,
            list: ({ list }) => list,
        })
            .pipe({
            parsedDetails: ({ target, detailPagesHtmlWithPipelineId }) => this.parseDetailPagesHtml(target, detailPagesHtmlWithPipelineId),
            target: ({ target }) => target,
            list: ({ list }) => list,
        })
            .pipe({
            processedArticles: ({ target, list, parsedDetails }) => this.mergeParsedArticles(target, list, parsedDetails),
            target: ({ target }) => target,
        })
            .pipe({
            count: ({ target, processedArticles }) => this.saveArticles(group, target, processedArticles),
        })
            .withRetry({ stopAfterAttempt: this.options.chain.stopAfterAttempt });
        const logSpec = {
            event: 'crawl.group',
            level: 'debug',
            startFields: {
                group: group.name,
                targets: group.targets.length,
            },
            doneFields: (total) => ({ totalSaved: total }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const inputs = group.targets.map((target) => ({ target }));
            const perTarget = await targetPipeline.batch(inputs, {
                maxConcurrency: this.provider.maxConcurrency,
            });
            let totalSaved = 0;
            for (const outcome of perTarget) {
                totalSaved = totalSaved + outcome.count;
            }
            return totalSaved;
        });
    }
    /** Download the HTML of the target's list page (`target.url`). */
    async fetchListPageHtml(target) {
        const logSpec = {
            event: 'crawl.list.fetch',
            level: 'debug',
            startFields: { target: this.describeTarget(target) },
        };
        return this.executeWithLogging(logSpec, async () => {
            const html = await getHtmlFromUrl(this.logger, target.url);
            return html;
        });
    }
    /**
     * Parse the list page with `target.parseList` and tag every resulting item
     * with a freshly generated `pipelineId`.
     */
    async parseListPageHtml(target, listPageHtml) {
        const logSpec = {
            event: 'crawl.list.parse',
            level: 'debug',
            startFields: {
                target: this.describeTarget(target),
                htmlLength: listPageHtml.length,
            },
            doneFields: (items) => ({ count: items.length }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const rawItems = await target.parseList(listPageHtml);
            return rawItems.map((item) => ({
                ...item,
                pipelineId: randomUUID(),
            }));
        });
    }
    /**
     * Remove list items whose `detailUrl` is among the articles returned by
     * `provider.fetchExistingArticlesByUrls`.
     */
    async dedupeListItems(target, parsedList) {
        const logSpec = {
            event: 'crawl.list.dedupe',
            level: 'debug',
            startFields: {
                target: this.describeTarget(target),
                inCount: parsedList.length,
            },
            doneFields: (deduped) => ({
                outCount: deduped.length,
                filtered: parsedList.length - deduped.length,
            }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const candidateUrls = parsedList.map((item) => item.detailUrl);
            const storedArticles = await this.provider.fetchExistingArticlesByUrls(candidateUrls);
            const knownUrls = new Set(storedArticles.map((article) => article.detailUrl));
            return parsedList.filter(({ detailUrl }) => !knownUrls.has(detailUrl));
        });
    }
    /**
     * Download every item's detail page (all requests are started together) and
     * pair each HTML string with the `pipelineId` of the item at the same index.
     */
    async fetchDetailPagesHtml(target, list) {
        const logSpec = {
            event: 'crawl.detail.fetch',
            level: 'debug',
            startFields: {
                target: this.describeTarget(target),
                count: list.length,
            },
            doneFields: (htmlList) => ({ count: htmlList.length }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const downloads = list.map((entry) => getHtmlFromUrl(this.logger, entry.detailUrl));
            const pages = await Promise.all(downloads);
            return pages.map((html, position) => {
                const { pipelineId } = list[position];
                return { pipelineId, html };
            });
        });
    }
    /**
     * Parse each detail page with `target.parseDetail`, carrying over the
     * `pipelineId` of the page at the same index.
     */
    async parseDetailPagesHtml(target, detailPagesHtmlWithPipelineId) {
        const logSpec = {
            event: 'crawl.detail.parse',
            level: 'debug',
            startFields: {
                target: this.describeTarget(target),
                count: detailPagesHtmlWithPipelineId.length,
            },
            doneFields: (details) => ({ count: details.length }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const parsedPages = await Promise.all(detailPagesHtmlWithPipelineId.map((page) => target.parseDetail(page.html)));
            return parsedPages.map((parsed, position) => ({
                pipelineId: detailPagesHtmlWithPipelineId[position].pipelineId,
                ...parsed,
            }));
        });
    }
    // The merge itself is synchronous; it is wrapped in an async callback only to fit the executeWithLogging interface
    /**
     * Join each parsed detail with the list item sharing its `pipelineId`
     * (detail fields win on conflict) and strip `pipelineId` from the result.
     *
     * @throws Error when a detail has no list item with the same `pipelineId`.
     */
    async mergeParsedArticles(target, list, parsedDetails) {
        const logSpec = {
            event: 'crawl.merge',
            level: 'debug',
            startFields: {
                target: this.describeTarget(target),
                listCount: list.length,
                detailCount: parsedDetails.length,
            },
            doneFields: (merged) => ({ count: merged.length }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const itemsByPipelineId = new Map();
            for (const item of list) {
                itemsByPipelineId.set(item.pipelineId, item);
            }
            return parsedDetails.map((detail) => {
                const listItem = itemsByPipelineId.get(detail.pipelineId);
                if (!listItem) {
                    throw new Error(`No matching list item for detail with pipelineId: ${detail.pipelineId}`);
                }
                const listPart = omit(listItem, ['pipelineId']);
                const detailPart = omit(detail, ['pipelineId']);
                return { ...listPart, ...detailPart };
            });
        });
    }
    /**
     * Pass the merged articles to `provider.saveCrawledArticles` together with
     * the task id, the group (without its `targets`) and the target.
     *
     * @returns Whatever the provider's save call resolves to.
     */
    async saveArticles(group, target, processedArticles) {
        const groupWithoutTargets = omit(group, ['targets']);
        const logSpec = {
            event: 'crawl.save',
            level: 'debug',
            startFields: {
                group: groupWithoutTargets,
                target: this.describeTarget(target),
                count: processedArticles.length,
            },
            doneFields: (saved) => ({ saved }),
        };
        return this.executeWithLogging(logSpec, async () => {
            const saveContext = {
                taskId: this.taskId,
                targetGroup: groupWithoutTargets,
                target,
            };
            const saved = await this.provider.saveCrawledArticles(processedArticles, saveContext);
            return saved;
        });
    }
    /** Compact description of a target for log fields. */
    describeTarget(target) {
        const { name, url } = target;
        return {
            name: name || 'unknown',
            listUrl: url,
        };
    }
}
|
|
1563
|
+
|
|
1564
|
+
/**
 * Orchestrator for LLM-based newsletter generation.
 * - Runs Crawling → Analysis → Content Generation as one sequence inside a managed task;
 *   all collaborators are supplied through the constructor config.
 */
class GenerateNewsletter {
    /** Collaborators and settings taken from the constructor config */
    dateService;
    taskService;
    crawlingProvider;
    analysisProvider;
    contentGenerateProvider;
    logger;
    options;
    previewNewsletterOptions;
    /** Id of the current task; set from `taskService.start()` **/
    taskId = null;
    /**
     * Constructor
     *
     * Defaults: `options.llm.maxRetries = 5`, `options.chain.stopAfterAttempt = 3`,
     * `options.content = config.contentOptions`, and a no-op logger when
     * `config.options.logger` is absent.
     *
     * @param config
     * @example
     * const generator = new GenerateNewsletter({
     *   contentOptions,
     *   dateService,
     *   taskService,
     *   crawlingProvider,
     *   analysisProvider,
     *   contentGenerateProvider,
     *   options: { llm: { maxRetries: 5 } },
     * });
     */
    constructor(config) {
        const overrides = config.options;
        const defaults = {
            content: config.contentOptions,
            llm: { maxRetries: 5 },
            chain: { stopAfterAttempt: 3 },
        };
        this.dateService = config.dateService;
        this.taskService = config.taskService;
        this.crawlingProvider = config.crawlingProvider;
        this.analysisProvider = config.analysisProvider;
        this.contentGenerateProvider = config.contentGenerateProvider;
        // Top-level keys are overridden wholesale; `llm` and `chain` are merged key by key
        const mergedLlm = { ...defaults.llm, ...overrides?.llm };
        const mergedChain = { ...defaults.chain, ...overrides?.chain };
        this.options = {
            ...defaults,
            ...overrides,
            llm: mergedLlm,
            chain: mergedChain,
        };
        // Fall back to a logger that discards everything
        const silentLogger = {
            info: (_msg) => { },
            debug: (_msg) => { },
            error: (_msg) => { },
        };
        this.logger = overrides?.logger ?? silentLogger;
        // Keep the preview settings (may be undefined)
        this.previewNewsletterOptions = overrides?.previewNewsletter;
    }
    /**
     * Run the whole generation pipeline.
     *
     * @returns The `newsletterId` produced by the chain sequence (null is treated as
     *          "nothing published" by the logging and preview steps).
     */
    async generate() {
        const pipelineResult = await this.executeWithTaskManagement(async () => {
            const loggingExecutor = new LoggingExecutor(this.logger, this.taskId);
            // Settings common to all three chains
            const shared = {
                logger: this.logger,
                taskId: this.taskId,
                options: this.options,
                loggingExecutor,
            };
            const crawlingChain = new CrawlingChain({
                ...shared,
                provider: this.crawlingProvider,
            });
            const analysisChain = new AnalysisChain({
                ...shared,
                provider: this.analysisProvider,
                dateService: this.dateService,
            });
            const contentGenerateChain = new ContentGenerateChain({
                ...shared,
                provider: this.contentGenerateProvider,
                dateService: this.dateService,
            });
            const stages = [
                crawlingChain.chain,
                analysisChain.chain,
                contentGenerateChain.chain,
            ];
            const output = await RunnableSequence.from(stages).invoke({});
            return output;
        });
        const { newsletterId } = pipelineResult;
        this.logNewsletterResult(newsletterId);
        await this.sendPreviewNewsletterIfConfigured(newsletterId);
        return newsletterId;
    }
    /**
     * Start a task, run `pipeline` under a logged 'task' event, and always end
     * the task afterwards, whether the pipeline succeeded or threw.
     */
    async executeWithTaskManagement(pipeline) {
        await this.startTask();
        const executor = new LoggingExecutor(this.logger, this.taskId);
        const logSpec = { event: 'task', level: 'info' };
        try {
            // Awaited inside `try` so the task is ended only after the pipeline settles
            const outcome = await executor.executeWithLogging(logSpec, async () => {
                const value = await pipeline();
                return value;
            });
            return outcome;
        }
        finally {
            await this.endTask();
        }
    }
    /** Log whether a newsletter was created or skipped (`newsletterId === null`). */
    logNewsletterResult(newsletterId) {
        const wasSkipped = newsletterId === null;
        const entry = wasSkipped
            ? {
                event: 'generate.result.skipped',
                taskId: this.taskId,
                data: { reason: 'publicationCriteria.notMet' },
            }
            : {
                event: 'generate.result.created',
                taskId: this.taskId,
                data: { newsletterId },
            };
        this.logger.info(entry);
    }
    /**
     * Email a preview when preview options were configured and a newsletter exists.
     * Failures are logged and swallowed, so a preview problem never fails generation.
     */
    async sendPreviewNewsletterIfConfigured(newsletterId) {
        const preview = this.previewNewsletterOptions;
        if (!preview) {
            return;
        }
        if (newsletterId === null) {
            this.logger.info({
                event: 'generate.preview.skip',
                taskId: this.taskId,
                data: { reason: 'noNewsletterCreated' },
            });
            return;
        }
        try {
            // Load the newsletter to preview
            const newsletter = await preview.fetchNewsletterForPreview();
            // Build the message on top of the configured envelope
            const message = {
                ...preview.emailMessage,
                subject: `[Preview] ${newsletter.title}`,
                html: newsletter.htmlBody,
                text: `${newsletter.title}\n\nIssue #${newsletter.issueOrder} - ${newsletter.date}`,
            };
            await preview.emailService.send(message);
            this.logger.info({
                event: 'generate.preview.sent',
                taskId: this.taskId,
                data: {
                    newsletterId,
                    to: preview.emailMessage.to,
                },
            });
        }
        catch (err) {
            this.logger.error({
                event: 'generate.preview.error',
                taskId: this.taskId,
                data: { newsletterId },
            });
            this.logger.error(err);
        }
    }
    /** Begin a task and remember the id returned by the task service. */
    async startTask() {
        const startedTaskId = await this.taskService.start();
        this.taskId = startedTaskId;
    }
    /** Tell the task service the task is over. */
    async endTask() {
        await this.taskService.end();
    }
}
|
|
1753
|
+
|
|
1754
|
+
export { DateType, GenerateNewsletter };
|
|
1755
|
+
//# sourceMappingURL=index.js.map
|