harvester_sdk 1.0.9 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/types.ts CHANGED
@@ -113,65 +113,228 @@ export const zodGeoSelectionSchema = z.object({
113
113
  updated_at: z.number().optional(), // last update date
114
114
  });
115
115
 
116
+ // Platform-specific metadata schemas: typed shapes for known platforms; zodDataSchema.metadata also accepts a free-form record as a fallback
117
+ export const telegramMetadataSchema = z.object({ // Telegram member of the zodDataSchema.metadata union; channel_id and post_author keys are mandatory, all other keys optional
118
+ channel_id: z.string(), // required
119
+ views: z.number().optional(),
120
+ forwards: z.number().optional(),
121
+ reactions: z.array(z.any()).optional(), // element shape is not validated (z.any); facebookMetadataSchema models reactions as a name -> count record instead
122
+ hashtags: z.array(z.string()).optional(),
123
+ mentions: z.array(z.string()).optional(),
124
+ is_pinned: z.boolean().optional(),
125
+ is_edited: z.boolean().optional(),
126
+ edit_date: z.number().optional(), // presumably a timestamp; unit (seconds vs milliseconds) is not stated here — TODO confirm
127
+ post_author: z.string().nullable(), // key must be present (no .optional()) but the value may be null — NOTE(review): confirm this asymmetry with the optional fields is intended
128
+ via_bot_id: z.string().optional(), // IDs in this schema are typed as strings — presumably stringified by the producer; verify against caller
129
+ grouped_id: z.string().optional(),
130
+ is_silent: z.boolean().optional(),
131
+ forwarded_from_id: z.string().optional(),
132
+ forwarded_date: z.number().optional(), // presumably a timestamp; unit not stated here — TODO confirm
133
+ });
134
+
135
+ export const facebookMetadataSchema = z.object({ // Facebook member of the zodDataSchema.metadata union; only post_id is required
136
+ post_id: z.string(), // required — instagramMetadataSchema requires the same key, so an object carrying just post_id satisfies both schemas
137
+ likes: z.number().optional(),
138
+ shares: z.number().optional(),
139
+ comments_count: z.number().optional(),
140
+ reactions: z.record(z.string(), z.number()).optional(), // reaction name -> count, e.g., { "like": 10, "love": 5 }
141
+ is_pinned: z.boolean().optional(),
142
+ is_edited: z.boolean().optional(),
143
+ edit_date: z.number().optional(), // presumably a timestamp; unit (seconds vs milliseconds) is not stated here — TODO confirm
144
+ });
145
+
146
+ export const instagramMetadataSchema = z.object({ // Instagram member of the zodDataSchema.metadata union; only post_id is required
147
+ post_id: z.string(), // required — same key name as facebookMetadataSchema, which is listed before this schema in the metadata union
148
+ likes: z.number().optional(),
149
+ comments_count: z.number().optional(),
150
+ hashtags: z.array(z.string()).optional(),
151
+ mentions: z.array(z.string()).optional(),
152
+ location: z.string().optional(), // free-form string; no structure is enforced here
153
+ is_reel: z.boolean().optional(), // NOTE(review): is_reel and is_story are independent flags; nothing here prevents both being true
154
+ is_story: z.boolean().optional(),
155
+ });
156
+
157
+ export const tiktokMetadataSchema = z.object({ // TikTok member of the zodDataSchema.metadata union; only video_id is required
158
+ video_id: z.string(), // required
159
+ likes: z.number().optional(),
160
+ shares: z.number().optional(),
161
+ comments_count: z.number().optional(),
162
+ views: z.number().optional(),
163
+ hashtags: z.array(z.string()).optional(),
164
+ mentions: z.array(z.string()).optional(),
165
+ sound_name: z.string().optional(),
166
+ sound_id: z.string().optional(),
167
+ duration: z.number().optional(), // unit not stated here; mediaItemSchema.duration is documented in seconds — presumably the same, TODO confirm
168
+ });
169
+
170
+ export const websiteMetadataSchema = z.object({ // Website member of the zodDataSchema.metadata union; only url is required
171
+ article_id: z.string().optional(), // optional, unlike the ID keys of the other platform schemas
172
+ url: z.string().url(), // required and must pass Zod's URL check
173
+ title: z.string().optional(),
174
+ author: z.string().optional(),
175
+ publish_date: z.number().optional(), // presumably a timestamp; unit (seconds vs milliseconds) is not stated here — TODO confirm
176
+ category: z.string().optional(),
177
+ tags: z.array(z.string()).optional(),
178
+ });
179
+
180
+ // Schema for one entry of zodDataSchema.media: a type tag plus a locator, with optional descriptive fields
181
+ export const mediaItemSchema = z.object({
182
+ type: z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']), // 'document', 'gif' and 'sticker' are new relative to the inline media enum this schema replaces
183
+ url: z.string(), // Can be a URL or file ID/path, so there is no .url() check (the inline schema this replaces had one)
184
+ caption: z.string().optional(),
185
+ thumbnail_url: z.string().optional(), // plain string; not URL-validated
186
+ width: z.number().optional(), // unit not stated — presumably pixels, TODO confirm
187
+ height: z.number().optional(), // unit not stated — presumably pixels, TODO confirm
188
+ duration: z.number().optional(), // for video/audio in seconds
189
+ size: z.number().optional(), // file size in bytes
190
+ mime_type: z.string().optional(),
191
+ });
192
+
193
+ // Structured reply summary, used as zodDataSchema.replies_info alongside the legacy replies field
194
+ export const repliesInfoSchema = z.object({
195
+ count: z.number().default(0), // a missing count parses to 0
196
+ recent_repliers: z.array(z.string()).optional(), // strings — whether these are IDs or usernames is not stated here, TODO confirm
197
+ has_thread: z.boolean().optional(),
198
+ thread_id: z.string().optional(),
199
+ });
200
+
201
+ // Structured author record, used as zodDataSchema.author_info; every key is optional, so an empty object is valid
202
+ export const authorSchema = z.object({
203
+ id: z.string().optional(),
204
+ username: z.string().optional(),
205
+ display_name: z.string().optional(),
206
+ avatar_url: z.string().optional(), // plain string; not URL-validated
207
+ is_verified: z.boolean().optional(),
208
+ follower_count: z.number().optional(),
209
+ });
210
+
116
211
  export const zodDataSchema = z.object({
117
212
  _id: z.string().optional(),
118
- timestamp: z.number().optional(), // INDEX - date in milliseconds - e.g., 1751210833000
119
- platform: z.enum(platformsList).optional(), // e.g., 'telegram', 'facebook'
120
- source_region_id: z.string().optional(), // e.g., 'hebron' -> get from source object
121
- source_region_title: z.string().optional(), // e.g., 'hebron' -> get from source object
122
- source_dominant_geos: z.array(z.string()).optional(), // e.g., ['hebron', 'west bank'] -> get from source object
123
- source_id: z.string().or(z.number()), // INDEX - reference to Source _id (e.g., '60c72b2f9b1e8d3f4c8b4567') -> get from source object
124
- source_title: z.string().optional(), // e.g., 'Telegram Channel Name' -> get from source object
125
- source_url: z.string().url().optional(), // e.g., 'https://t.me/telegram_channel_name' -> get from source object
126
- source_group_id: z.string().optional(), // e.g., 'default_pipeline' -> get from source object
127
- data_id: z.string().or(z.number()).optional(), // INDEX - original text ID (e.g., message_id) -> get from post
128
- data_geo: z.array(z.string()).optional(), // INDEX - e.g., ["sinjil", "ramallah", "west bank"] -> get from processor
129
- data_text: z.string().optional(), // processed text content -> get from post or processor
130
- data_url: z.string().url().optional(), // original text URL if available -> get from post
131
- data_original_type: z.string().optional(), // e.g., 'post', 'comment', 'reply', 'video', 'image' -> get from post
132
- data_language: z.string().optional(), // detected language of the text -> get from post or processor
133
- data_sentiment: z.string().optional(), // sentiment analysis result -> get from processor
134
- data_timestamp: z.number().optional(), // original post timestamp if different from ingestion timestamp -> get from post
135
- is_reply: z.boolean().optional(), // true if this text is a reply to another text -> get from post
136
- reply_to_message_id: z.string().or(z.number()).optional(), // ID of the message this is a reply to -> get from post
137
- metadata: z.record(z.string(), z.any()).optional(), // platform-specific fields -> get from post
213
+
214
+ // Timestamps
215
+ timestamp: z.number().optional(), // INDEX - ingestion timestamp
216
+ data_timestamp: z.number().optional(), // original post timestamp
138
217
  created_at: z.number(),
139
218
  updated_at: z.number(),
140
- media: z
141
- .array(
142
- z.object({
143
- type: z.enum(['image', 'video', 'audio', 'link']),
144
- url: z.string().url(),
145
- caption: z.string().optional(),
146
- })
147
- )
148
- .optional(), // media attachments -> get from post
149
- author: z.string().optional(), // e.g., author name or ID -> get from post
150
- author_username: z.string().optional(), // e.g., author username -> get from post
151
- replies: z.any().optional(), // array of reply texts or IDs -> get from post
152
- author_id: z.string().optional(), // e.g., author ID -> get from post
153
- // translated_text: z.string().optional(), // translated text if available - most of the time it will be translated to English
154
- // entities: z.any().optional(), // array of reply texts or IDs
155
- // source_public_id: z.string().or(z.number()), // e.g., 'telegram:1234567890' (message_id)
156
- // platform_id: z.string().or(z.number()), // channel_id reference to Source source_id
157
- // original_text_id: z.string().or(z.number()), // message_id
219
+
220
+ // Platform & Source Info (denormalized for query performance)
221
+ platform: z.enum(platformsList).optional(),
222
+ source_id: z.string().or(z.number()), // INDEX - reference to Source _id
223
+ source_title: z.string().optional(),
224
+ source_url: z.string().url().optional(),
225
+ source_region_id: z.string().optional(), // INDEX
226
+ source_region_title: z.string().optional(),
227
+ source_group_id: z.string().optional(), // INDEX
228
+ source_dominant_geos: z.array(z.string()).optional().default([]),
229
+
230
+ // Data/Post Identifiers
231
+ data_id: z.string().or(z.number()).optional(), // INDEX - platform-specific post ID
232
+ data_url: z.string().url().optional(), // direct link to the post
233
+ data_original_type: z.enum([
234
+ 'post',
235
+ 'comment',
236
+ 'reply',
237
+ 'video',
238
+ 'image',
239
+ 'photo',
240
+ 'story',
241
+ 'reel',
242
+ 'article',
243
+ 'link',
244
+ 'document',
245
+ ]).optional(),
246
+
247
+ // Content
248
+ data_text: z.string().optional(),
249
+ data_language: z.string().optional(), // ISO 639-1 code (e.g., 'en', 'ar')
250
+ data_geo: z.array(z.string()).optional().default([]), // INDEX - extracted locations
251
+
252
+ // Analysis (populated by processors)
253
+ data_sentiment: z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(),
254
+ data_topics: z.array(z.string()).optional(), // extracted topics/themes
255
+ data_keywords: z.array(z.string()).optional(), // extracted keywords
256
+
257
+ // Media attachments
258
+ media: z.array(mediaItemSchema).optional().default([]),
259
+
260
+ // Author information
261
+ author: z.string().optional(), // legacy field - display name
262
+ author_username: z.string().optional(), // legacy field
263
+ author_id: z.string().optional(), // legacy field
264
+ author_info: authorSchema.optional(), // NEW - structured author info
265
+
266
+ // Reply/Thread information
267
+ is_reply: z.boolean().optional().default(false),
268
+ reply_to_message_id: z.string().or(z.number()).optional(),
269
+ reply_to_author_id: z.string().optional(),
270
+ replies: z.any().optional(), // legacy field - keep for backward compatibility
271
+ replies_info: repliesInfoSchema.optional(), // NEW - structured replies info
272
+
273
+ // Engagement metrics (platform-specific)
274
+ engagement: z.object({
275
+ views: z.number().optional(),
276
+ likes: z.number().optional(),
277
+ shares: z.number().optional(),
278
+ comments: z.number().optional(),
279
+ reactions: z.record(z.string(), z.number()).optional(), // e.g., { "like": 10, "love": 5 }
280
+ }).optional(),
281
+
282
+ // Content flags
283
+ is_edited: z.boolean().optional(),
284
+ edit_date: z.number().optional(),
285
+ is_pinned: z.boolean().optional(),
286
+ is_deleted: z.boolean().optional(),
287
+ is_forwarded: z.boolean().optional(),
288
+ forwarded_from_id: z.string().optional(),
289
+ forwarded_date: z.number().optional(),
290
+
291
+ // Hashtags and mentions (extracted for easier querying)
292
+ hashtags: z.array(z.string()).optional().default([]),
293
+ mentions: z.array(z.string()).optional().default([]),
294
+
295
+ // Platform-specific metadata (flexible)
296
+ metadata: z.union([
297
+ telegramMetadataSchema,
298
+ facebookMetadataSchema,
299
+ instagramMetadataSchema,
300
+ tiktokMetadataSchema,
301
+ websiteMetadataSchema,
302
+ z.record(z.string(), z.any()), // fallback for unknown platforms
303
+ ]).optional(),
304
+
305
+ processing_errors: z.string().optional(),
306
+
158
307
  });
159
308
 
309
+ // Cross-field rule on top of zodDataSchema: data_id is mandatory whenever platform is set to anything other than 'website'. Records without a platform are not checked.
310
+ export const zodDataSchemaWithValidation = zodDataSchema.refine(
311
+ (data) => {
312
+ // Test for absence explicitly: data_id may be a number, and a truthiness check would wrongly reject the valid id 0. Empty strings are still rejected.
313
+ if (data.platform && data.platform !== 'website' && (data.data_id === undefined || data.data_id === '')) {
314
+ return false;
315
+ }
316
+ return true;
317
+ },
318
+ {
319
+ message: "data_id is required for non-website platforms",
320
+ path: ["data_id"],
321
+ }
322
+ );
323
+
160
324
  export type RegionType = z.infer<typeof zodRegionSchema>;
161
325
  export type SourceGroupType = z.infer<typeof zodSourceGroupSchema>;
162
326
  export type SourceType = z.infer<typeof zodSourceSchema>;
163
327
  export type DataType = z.infer<typeof zodDataSchema>;
164
- export type SourceStatusType = (typeof sourceStatusList)[number];
165
- export type StatusType = (typeof generalStatusList)[number];
166
- export type TimeRangeTypeLiteral = 'relative' | 'absolute';
167
- export type AddSourceToReviewType = Pick<
168
- SourceType,
169
- 'platform' | 'url' | 'description'
170
- > &
171
- Partial<Pick<SourceType, 'public_id'>>;
172
-
173
- export type GeoType = z.infer<typeof zodGeoSchema>;
174
- export type GeoSelectionType = z.infer<typeof zodGeoSelectionSchema>;
328
+ export type MediaItemType = z.infer<typeof mediaItemSchema>; // element type of zodDataSchema.media
329
+ export type AuthorType = z.infer<typeof authorSchema>; // type of zodDataSchema.author_info
330
+ export type RepliesInfoType = z.infer<typeof repliesInfoSchema>; // type of zodDataSchema.replies_info
331
+
332
+ // Platform-specific metadata types, one per object schema listed in the zodDataSchema.metadata union
333
+ export type TelegramMetadataType = z.infer<typeof telegramMetadataSchema>;
334
+ export type FacebookMetadataType = z.infer<typeof facebookMetadataSchema>;
335
+ export type InstagramMetadataType = z.infer<typeof instagramMetadataSchema>;
336
+ export type TiktokMetadataType = z.infer<typeof tiktokMetadataSchema>;
337
+ export type WebsiteMetadataType = z.infer<typeof websiteMetadataSchema>;
175
338
 
176
339
  // Helper type to get allowed entities for a specific platform
177
340
  export type PlatformEntityType<T extends (typeof platformsList)[number]> =