harvester_sdk 1.0.8 → 1.0.10
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/index.d.ts +198 -42
- package/dist/index.js +105 -15
- package/dist/types.d.ts +1507 -46
- package/dist/types.js +167 -27
- package/index.ts +129 -27
- package/package.json +1 -1
- package/types.ts +212 -49
package/types.ts
CHANGED
|
@@ -113,65 +113,228 @@ export const zodGeoSelectionSchema = z.object({
|
|
|
113
113
|
updated_at: z.number().optional(), // last update date
|
|
114
114
|
});
|
|
115
115
|
|
|
116
|
+
// Add specific metadata schemas for better type safety while keeping flexibility
|
|
117
|
+
export const telegramMetadataSchema = z.object({
|
|
118
|
+
channel_id: z.string(),
|
|
119
|
+
views: z.number().optional(),
|
|
120
|
+
forwards: z.number().optional(),
|
|
121
|
+
reactions: z.array(z.any()).optional(),
|
|
122
|
+
hashtags: z.array(z.string()).optional(),
|
|
123
|
+
mentions: z.array(z.string()).optional(),
|
|
124
|
+
is_pinned: z.boolean().optional(),
|
|
125
|
+
is_edited: z.boolean().optional(),
|
|
126
|
+
edit_date: z.number().optional(),
|
|
127
|
+
post_author: z.string().nullable(),
|
|
128
|
+
via_bot_id: z.string().optional(),
|
|
129
|
+
grouped_id: z.string().optional(),
|
|
130
|
+
is_silent: z.boolean().optional(),
|
|
131
|
+
forwarded_from_id: z.string().optional(),
|
|
132
|
+
forwarded_date: z.number().optional(),
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
export const facebookMetadataSchema = z.object({
|
|
136
|
+
post_id: z.string(),
|
|
137
|
+
likes: z.number().optional(),
|
|
138
|
+
shares: z.number().optional(),
|
|
139
|
+
comments_count: z.number().optional(),
|
|
140
|
+
reactions: z.record(z.string(), z.number()).optional(), // e.g., { "like": 10, "love": 5 }
|
|
141
|
+
is_pinned: z.boolean().optional(),
|
|
142
|
+
is_edited: z.boolean().optional(),
|
|
143
|
+
edit_date: z.number().optional(),
|
|
144
|
+
});
|
|
145
|
+
|
|
146
|
+
export const instagramMetadataSchema = z.object({
|
|
147
|
+
post_id: z.string(),
|
|
148
|
+
likes: z.number().optional(),
|
|
149
|
+
comments_count: z.number().optional(),
|
|
150
|
+
hashtags: z.array(z.string()).optional(),
|
|
151
|
+
mentions: z.array(z.string()).optional(),
|
|
152
|
+
location: z.string().optional(),
|
|
153
|
+
is_reel: z.boolean().optional(),
|
|
154
|
+
is_story: z.boolean().optional(),
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
export const tiktokMetadataSchema = z.object({
|
|
158
|
+
video_id: z.string(),
|
|
159
|
+
likes: z.number().optional(),
|
|
160
|
+
shares: z.number().optional(),
|
|
161
|
+
comments_count: z.number().optional(),
|
|
162
|
+
views: z.number().optional(),
|
|
163
|
+
hashtags: z.array(z.string()).optional(),
|
|
164
|
+
mentions: z.array(z.string()).optional(),
|
|
165
|
+
sound_name: z.string().optional(),
|
|
166
|
+
sound_id: z.string().optional(),
|
|
167
|
+
duration: z.number().optional(),
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
export const websiteMetadataSchema = z.object({
|
|
171
|
+
article_id: z.string().optional(),
|
|
172
|
+
url: z.string().url(),
|
|
173
|
+
title: z.string().optional(),
|
|
174
|
+
author: z.string().optional(),
|
|
175
|
+
publish_date: z.number().optional(),
|
|
176
|
+
category: z.string().optional(),
|
|
177
|
+
tags: z.array(z.string()).optional(),
|
|
178
|
+
});
|
|
179
|
+
|
|
180
|
+
// Improved media schema with more types and optional dimensions
|
|
181
|
+
export const mediaItemSchema = z.object({
|
|
182
|
+
type: z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']),
|
|
183
|
+
url: z.string(), // Can be a URL or file ID/path
|
|
184
|
+
caption: z.string().optional(),
|
|
185
|
+
thumbnail_url: z.string().optional(),
|
|
186
|
+
width: z.number().optional(),
|
|
187
|
+
height: z.number().optional(),
|
|
188
|
+
duration: z.number().optional(), // for video/audio in seconds
|
|
189
|
+
size: z.number().optional(), // file size in bytes
|
|
190
|
+
mime_type: z.string().optional(),
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
// Improved replies schema
|
|
194
|
+
export const repliesInfoSchema = z.object({
|
|
195
|
+
count: z.number().default(0),
|
|
196
|
+
recent_repliers: z.array(z.string()).optional(),
|
|
197
|
+
has_thread: z.boolean().optional(),
|
|
198
|
+
thread_id: z.string().optional(),
|
|
199
|
+
});
|
|
200
|
+
|
|
201
|
+
// Improved author schema for better consistency
|
|
202
|
+
export const authorSchema = z.object({
|
|
203
|
+
id: z.string().optional(),
|
|
204
|
+
username: z.string().optional(),
|
|
205
|
+
display_name: z.string().optional(),
|
|
206
|
+
avatar_url: z.string().optional(),
|
|
207
|
+
is_verified: z.boolean().optional(),
|
|
208
|
+
follower_count: z.number().optional(),
|
|
209
|
+
});
|
|
210
|
+
|
|
116
211
|
export const zodDataSchema = z.object({
|
|
117
212
|
_id: z.string().optional(),
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
source_dominant_geos: z.array(z.string()).optional(), // e.g., ['hebron', 'west bank'] -> get from source object
|
|
123
|
-
source_id: z.string().or(z.number()), // INDEX - reference to Source _id (e.g., '60c72b2f9b1e8d3f4c8b4567') -> get from source object
|
|
124
|
-
source_title: z.string().optional(), // e.g., 'Telegram Channel Name' -> get from source object
|
|
125
|
-
source_url: z.string().url().optional(), // e.g., 'https://t.me/telegram_channel_name' -> get from source object
|
|
126
|
-
source_group_id: z.string().optional(), // e.g., 'default_pipeline' -> get from source object
|
|
127
|
-
data_id: z.string().or(z.number()).optional(), // INDEX - original text ID (e.g., message_id) -> get from post
|
|
128
|
-
data_geo: z.array(z.string()).optional(), // INDEX - e.g., ["sinjil", "ramallah", "west bank"] -> get from processor
|
|
129
|
-
data_text: z.string().optional(), // processed text content -> get from post or processor
|
|
130
|
-
data_url: z.string().url().optional(), // original text URL if available -> get from post
|
|
131
|
-
data_original_type: z.string().optional(), // e.g., 'post', 'comment', 'reply', 'video', 'image' -> get from post
|
|
132
|
-
data_language: z.string().optional(), // detected language of the text -> get from post or processor
|
|
133
|
-
data_sentiment: z.string().optional(), // sentiment analysis result -> get from processor
|
|
134
|
-
data_timestamp: z.number().optional(), // original post timestamp if different from ingestion timestamp -> get from post
|
|
135
|
-
is_reply: z.boolean().optional(), // true if this text is a reply to another text -> get from post
|
|
136
|
-
reply_to_message_id: z.string().or(z.number()).optional(), // ID of the message this is a reply to -> get from post
|
|
137
|
-
metadata: z.record(z.string(), z.any()).optional(), // platform-specific fields -> get from post
|
|
213
|
+
|
|
214
|
+
// Timestamps
|
|
215
|
+
timestamp: z.number().optional(), // INDEX - ingestion timestamp
|
|
216
|
+
data_timestamp: z.number().optional(), // original post timestamp
|
|
138
217
|
created_at: z.number(),
|
|
139
218
|
updated_at: z.number(),
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
219
|
+
|
|
220
|
+
// Platform & Source Info (denormalized for query performance)
|
|
221
|
+
platform: z.enum(platformsList).optional(),
|
|
222
|
+
source_id: z.string().or(z.number()), // INDEX - reference to Source _id
|
|
223
|
+
source_title: z.string().optional(),
|
|
224
|
+
source_url: z.string().url().optional(),
|
|
225
|
+
source_region_id: z.string().optional(), // INDEX
|
|
226
|
+
source_region_title: z.string().optional(),
|
|
227
|
+
source_group_id: z.string().optional(), // INDEX
|
|
228
|
+
source_dominant_geos: z.array(z.string()).optional().default([]),
|
|
229
|
+
|
|
230
|
+
// Data/Post Identifiers
|
|
231
|
+
data_id: z.string().or(z.number()).optional(), // INDEX - platform-specific post ID
|
|
232
|
+
data_url: z.string().url().optional(), // direct link to the post
|
|
233
|
+
data_original_type: z.enum([
|
|
234
|
+
'post',
|
|
235
|
+
'comment',
|
|
236
|
+
'reply',
|
|
237
|
+
'video',
|
|
238
|
+
'image',
|
|
239
|
+
'photo',
|
|
240
|
+
'story',
|
|
241
|
+
'reel',
|
|
242
|
+
'article',
|
|
243
|
+
'link',
|
|
244
|
+
'document',
|
|
245
|
+
]).optional(),
|
|
246
|
+
|
|
247
|
+
// Content
|
|
248
|
+
data_text: z.string().optional(),
|
|
249
|
+
data_language: z.string().optional(), // ISO 639-1 code (e.g., 'en', 'ar')
|
|
250
|
+
data_geo: z.array(z.string()).optional().default([]), // INDEX - extracted locations
|
|
251
|
+
|
|
252
|
+
// Analysis (populated by processors)
|
|
253
|
+
data_sentiment: z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(),
|
|
254
|
+
data_topics: z.array(z.string()).optional(), // extracted topics/themes
|
|
255
|
+
data_keywords: z.array(z.string()).optional(), // extracted keywords
|
|
256
|
+
|
|
257
|
+
// Media attachments
|
|
258
|
+
media: z.array(mediaItemSchema).optional().default([]),
|
|
259
|
+
|
|
260
|
+
// Author information
|
|
261
|
+
author: z.string().optional(), // legacy field - display name
|
|
262
|
+
author_username: z.string().optional(), // legacy field
|
|
263
|
+
author_id: z.string().optional(), // legacy field
|
|
264
|
+
author_info: authorSchema.optional(), // NEW - structured author info
|
|
265
|
+
|
|
266
|
+
// Reply/Thread information
|
|
267
|
+
is_reply: z.boolean().optional().default(false),
|
|
268
|
+
reply_to_message_id: z.string().or(z.number()).optional(),
|
|
269
|
+
reply_to_author_id: z.string().optional(),
|
|
270
|
+
replies: z.any().optional(), // legacy field - keep for backward compatibility
|
|
271
|
+
replies_info: repliesInfoSchema.optional(), // NEW - structured replies info
|
|
272
|
+
|
|
273
|
+
// Engagement metrics (platform-specific)
|
|
274
|
+
engagement: z.object({
|
|
275
|
+
views: z.number().optional(),
|
|
276
|
+
likes: z.number().optional(),
|
|
277
|
+
shares: z.number().optional(),
|
|
278
|
+
comments: z.number().optional(),
|
|
279
|
+
reactions: z.record(z.string(), z.number()).optional(), // e.g., { "like": 10, "love": 5 }
|
|
280
|
+
}).optional(),
|
|
281
|
+
|
|
282
|
+
// Content flags
|
|
283
|
+
is_edited: z.boolean().optional(),
|
|
284
|
+
edit_date: z.number().optional(),
|
|
285
|
+
is_pinned: z.boolean().optional(),
|
|
286
|
+
is_deleted: z.boolean().optional(),
|
|
287
|
+
is_forwarded: z.boolean().optional(),
|
|
288
|
+
forwarded_from_id: z.string().optional(),
|
|
289
|
+
forwarded_date: z.number().optional(),
|
|
290
|
+
|
|
291
|
+
// Hashtags and mentions (extracted for easier querying)
|
|
292
|
+
hashtags: z.array(z.string()).optional().default([]),
|
|
293
|
+
mentions: z.array(z.string()).optional().default([]),
|
|
294
|
+
|
|
295
|
+
// Platform-specific metadata (flexible)
|
|
296
|
+
metadata: z.union([
|
|
297
|
+
telegramMetadataSchema,
|
|
298
|
+
facebookMetadataSchema,
|
|
299
|
+
instagramMetadataSchema,
|
|
300
|
+
tiktokMetadataSchema,
|
|
301
|
+
websiteMetadataSchema,
|
|
302
|
+
z.record(z.string(), z.any()), // fallback for unknown platforms
|
|
303
|
+
]).optional(),
|
|
304
|
+
|
|
305
|
+
processing_errors: z.string().optional(),
|
|
306
|
+
|
|
158
307
|
});
|
|
159
308
|
|
|
309
|
+
// Add validation refinement for platform-specific fields
|
|
310
|
+
export const zodDataSchemaWithValidation = zodDataSchema.refine(
|
|
311
|
+
(data) => {
|
|
312
|
+
// Ensure data_id exists for most platforms except websites
|
|
313
|
+
if (data.platform && data.platform !== 'website' && !data.data_id) {
|
|
314
|
+
return false;
|
|
315
|
+
}
|
|
316
|
+
return true;
|
|
317
|
+
},
|
|
318
|
+
{
|
|
319
|
+
message: "data_id is required for non-website platforms",
|
|
320
|
+
path: ["data_id"],
|
|
321
|
+
}
|
|
322
|
+
);
|
|
323
|
+
|
|
160
324
|
export type RegionType = z.infer<typeof zodRegionSchema>;
|
|
161
325
|
export type SourceGroupType = z.infer<typeof zodSourceGroupSchema>;
|
|
162
326
|
export type SourceType = z.infer<typeof zodSourceSchema>;
|
|
163
327
|
export type DataType = z.infer<typeof zodDataSchema>;
|
|
164
|
-
export type
|
|
165
|
-
export type
|
|
166
|
-
export type
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
export type
|
|
174
|
-
export type GeoSelectionType = z.infer<typeof zodGeoSelectionSchema>;
|
|
328
|
+
export type MediaItemType = z.infer<typeof mediaItemSchema>;
|
|
329
|
+
export type AuthorType = z.infer<typeof authorSchema>;
|
|
330
|
+
export type RepliesInfoType = z.infer<typeof repliesInfoSchema>;
|
|
331
|
+
|
|
332
|
+
// Platform-specific metadata types
|
|
333
|
+
export type TelegramMetadataType = z.infer<typeof telegramMetadataSchema>;
|
|
334
|
+
export type FacebookMetadataType = z.infer<typeof facebookMetadataSchema>;
|
|
335
|
+
export type InstagramMetadataType = z.infer<typeof instagramMetadataSchema>;
|
|
336
|
+
export type TiktokMetadataType = z.infer<typeof tiktokMetadataSchema>;
|
|
337
|
+
export type WebsiteMetadataType = z.infer<typeof websiteMetadataSchema>;
|
|
175
338
|
|
|
176
339
|
// Helper type to get allowed entities for a specific platform
|
|
177
340
|
export type PlatformEntityType<T extends (typeof platformsList)[number]> =
|