npm - harvester_sdk - Versions diffs - 1.0.8 → 1.0.10 - Mend

harvester_sdk 1.0.8 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/types.ts CHANGED Viewed

@@ -113,65 +113,228 @@ export const zodGeoSelectionSchema = z.object({
   updated_at: z.number().optional(), // last update date
 });
+// Add specific metadata schemas for better type safety while keeping flexibility
+export const telegramMetadataSchema = z.object({ // Zod object schema for Telegram-specific post metadata
+  channel_id: z.string(), // required
+  views: z.number().optional(),
+  forwards: z.number().optional(),
+  reactions: z.array(z.any()).optional(), // element shape is not validated (z.any)
+  hashtags: z.array(z.string()).optional(),
+  mentions: z.array(z.string()).optional(),
+  is_pinned: z.boolean().optional(),
+  is_edited: z.boolean().optional(),
+  edit_date: z.number().optional(), // NOTE(review): presumably an epoch timestamp; the unit is not stated here — confirm
+  post_author: z.string().nullable(), // nullable but NOT optional: the key must be present (string or null)
+  via_bot_id: z.string().optional(),
+  grouped_id: z.string().optional(),
+  is_silent: z.boolean().optional(),
+  forwarded_from_id: z.string().optional(),
+  forwarded_date: z.number().optional(), // NOTE(review): same unit question as edit_date
+});
+export const facebookMetadataSchema = z.object({ // Zod object schema for Facebook-specific post metadata
+  post_id: z.string(), // the only required field in this schema
+  likes: z.number().optional(),
+  shares: z.number().optional(),
+  comments_count: z.number().optional(),
+  reactions: z.record(z.string(), z.number()).optional(), // string keys mapped to numbers, e.g., { "like": 10, "love": 5 }
+  is_pinned: z.boolean().optional(),
+  is_edited: z.boolean().optional(),
+  edit_date: z.number().optional(), // NOTE(review): presumably an epoch timestamp; the unit is not stated here — confirm
+});
+export const instagramMetadataSchema = z.object({ // Zod object schema for Instagram-specific post metadata
+  post_id: z.string(), // the only required field; NOTE(review): facebookMetadataSchema also requires only post_id and is listed earlier in the zodDataSchema `metadata` union — confirm which option Instagram payloads actually match
+  likes: z.number().optional(),
+  comments_count: z.number().optional(),
+  hashtags: z.array(z.string()).optional(),
+  mentions: z.array(z.string()).optional(),
+  location: z.string().optional(), // free-form string; no structure is enforced
+  is_reel: z.boolean().optional(),
+  is_story: z.boolean().optional(),
+});
+export const tiktokMetadataSchema = z.object({ // Zod object schema for TikTok-specific post metadata
+  video_id: z.string(), // the only required field in this schema
+  likes: z.number().optional(),
+  shares: z.number().optional(),
+  comments_count: z.number().optional(),
+  views: z.number().optional(),
+  hashtags: z.array(z.string()).optional(),
+  mentions: z.array(z.string()).optional(),
+  sound_name: z.string().optional(),
+  sound_id: z.string().optional(),
+  duration: z.number().optional(), // NOTE(review): unit not stated here; mediaItemSchema documents its duration in seconds — confirm this matches
+});
+export const websiteMetadataSchema = z.object({ // Zod object schema for website/article metadata
+  article_id: z.string().optional(), // optional, unlike the required id fields of the other platform schemas
+  url: z.string().url(), // required and must pass Zod's URL format check
+  title: z.string().optional(),
+  author: z.string().optional(),
+  publish_date: z.number().optional(), // NOTE(review): presumably an epoch timestamp; the unit is not stated here — confirm
+  category: z.string().optional(),
+  tags: z.array(z.string()).optional(),
+});
+// Improved media schema with more types and optional dimensions
+export const mediaItemSchema = z.object({ // Zod object schema for one media attachment; used as the element type of zodDataSchema.media
+  type: z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']), // required; must be one of these literals
+  url: z.string(), // Can be a URL or file ID/path, so no .url() format check is applied
+  caption: z.string().optional(),
+  thumbnail_url: z.string().optional(), // plain string; not URL-validated
+  width: z.number().optional(), // NOTE(review): presumably pixels — confirm
+  height: z.number().optional(), // NOTE(review): presumably pixels — confirm
+  duration: z.number().optional(), // for video/audio in seconds
+  size: z.number().optional(), // file size in bytes
+  mime_type: z.string().optional(),
+});
+// Improved replies schema
+export const repliesInfoSchema = z.object({ // Zod object schema for structured reply/thread information
+  count: z.number().default(0), // parsed output is 0 when the key is omitted
+  recent_repliers: z.array(z.string()).optional(), // NOTE(review): presumably user IDs or usernames — confirm against producers
+  has_thread: z.boolean().optional(),
+  thread_id: z.string().optional(),
+});
+// Improved author schema for better consistency
+export const authorSchema = z.object({ // Zod object schema for structured author information; every field is optional, so {} validates
+  id: z.string().optional(),
+  username: z.string().optional(),
+  display_name: z.string().optional(),
+  avatar_url: z.string().optional(), // plain string; not URL-validated
+  is_verified: z.boolean().optional(),
+  follower_count: z.number().optional(),
+});
 export const zodDataSchema = z.object({
   _id: z.string().optional(),
-  timestamp: z.number().optional(), // INDEX - date in milliseconds - e.g., 1751210833000
-  platform: z.enum(platformsList).optional(), // e.g., 'telegram', 'facebook'
-  source_region_id: z.string().optional(), // e.g., 'hebron' -> get from source object
-  source_region_title: z.string().optional(), // e.g., 'hebron' -> get from source object
-  source_dominant_geos: z.array(z.string()).optional(), // e.g., ['hebron', 'west bank'] -> get from source object
-  source_id: z.string().or(z.number()), // INDEX - reference to Source _id (e.g., '60c72b2f9b1e8d3f4c8b4567') -> get from source object
-  source_title: z.string().optional(), // e.g., 'Telegram Channel Name' -> get from source object
-  source_url: z.string().url().optional(), // e.g., 'https://t.me/telegram_channel_name' -> get from source object
-  source_group_id: z.string().optional(), // e.g., 'default_pipeline' -> get from source object
-  data_id: z.string().or(z.number()).optional(), // INDEX - original text ID (e.g., message_id) -> get from post
-  data_geo: z.array(z.string()).optional(), // INDEX - e.g., ["sinjil", "ramallah", "west bank"] -> get from processor
-  data_text: z.string().optional(), // processed text content -> get from post or processor
-  data_url: z.string().url().optional(), // original text URL if available -> get from post
-  data_original_type: z.string().optional(), // e.g., 'post', 'comment', 'reply', 'video', 'image' -> get from post
-  data_language: z.string().optional(), // detected language of the text -> get from post or processor
-  data_sentiment: z.string().optional(), // sentiment analysis result -> get from processor
-  data_timestamp: z.number().optional(), // original post timestamp if different from ingestion timestamp -> get from post
-  is_reply: z.boolean().optional(), // true if this text is a reply to another text -> get from post
-  reply_to_message_id: z.string().or(z.number()).optional(), // ID of the message this is a reply to -> get from post
-  metadata: z.record(z.string(), z.any()).optional(), // platform-specific fields -> get from post
+  // Timestamps
+  timestamp: z.number().optional(), // INDEX - ingestion timestamp
+  data_timestamp: z.number().optional(), // original post timestamp
   created_at: z.number(),
   updated_at: z.number(),
-  media: z
-    .array(
-      z.object({
-        type: z.enum(['image', 'video', 'audio', 'link']),
-        url: z.string().url(),
-        caption: z.string().optional(),
-      })
-    )
-    .optional(), // media attachments -> get from post
-  author: z.string().optional(), // e.g., author name or ID -> get from post
-  author_username: z.string().optional(), // e.g., author username -> get from post
-  replies: z.any().optional(), // array of reply texts or IDs -> get from post
-  author_id: z.string().optional(), // e.g., author ID -> get from post
-  // translated_text: z.string().optional(), // translated text if available - most of the time it will be translated to English
-  // entities: z.any().optional(), // array of reply texts or IDs
-  // source_public_id: z.string().or(z.number()), // e.g., 'telegram:1234567890' (message_id)
-  // platform_id: z.string().or(z.number()), // channel_id reference to Source source_id
-  // original_text_id: z.string().or(z.number()), // message_id
+  // Platform & Source Info (denormalized for query performance)
+  platform: z.enum(platformsList).optional(),
+  source_id: z.string().or(z.number()), // INDEX - reference to Source _id
+  source_title: z.string().optional(),
+  source_url: z.string().url().optional(),
+  source_region_id: z.string().optional(), // INDEX
+  source_region_title: z.string().optional(),
+  source_group_id: z.string().optional(), // INDEX
+  source_dominant_geos: z.array(z.string()).optional().default([]), // parsed output is [] when omitted
+  // Data/Post Identifiers
+  data_id: z.string().or(z.number()).optional(), // INDEX - platform-specific post ID
+  data_url: z.string().url().optional(), // direct link to the post
+  data_original_type: z.enum([ // previously a free-form string; values outside this list now fail validation
+    'post',
+    'comment',
+    'reply',
+    'video',
+    'image',
+    'photo',
+    'story',
+    'reel',
+    'article',
+    'link',
+    'document',
+  ]).optional(),
+  // Content
+  data_text: z.string().optional(),
+  data_language: z.string().optional(), // ISO 639-1 code (e.g., 'en', 'ar')
+  data_geo: z.array(z.string()).optional().default([]), // INDEX - extracted locations
+  // Analysis (populated by processors)
+  data_sentiment: z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(), // previously a free-form string; other values now fail validation
+  data_topics: z.array(z.string()).optional(), // extracted topics/themes
+  data_keywords: z.array(z.string()).optional(), // extracted keywords
+  // Media attachments
+  media: z.array(mediaItemSchema).optional().default([]), // parsed output is [] when omitted
+  // Author information
+  author: z.string().optional(), // legacy field - display name
+  author_username: z.string().optional(), // legacy field
+  author_id: z.string().optional(), // legacy field
+  author_info: authorSchema.optional(), // NEW - structured author info
+  // Reply/Thread information
+  is_reply: z.boolean().optional().default(false), // parsed output is false when omitted
+  reply_to_message_id: z.string().or(z.number()).optional(),
+  reply_to_author_id: z.string().optional(),
+  replies: z.any().optional(), // legacy field - keep for backward compatibility
+  replies_info: repliesInfoSchema.optional(), // NEW - structured replies info
+  // Engagement metrics (platform-specific)
+  engagement: z.object({
+    views: z.number().optional(),
+    likes: z.number().optional(),
+    shares: z.number().optional(),
+    comments: z.number().optional(),
+    reactions: z.record(z.string(), z.number()).optional(), // e.g., { "like": 10, "love": 5 }
+  }).optional(),
+  // Content flags
+  is_edited: z.boolean().optional(),
+  edit_date: z.number().optional(),
+  is_pinned: z.boolean().optional(),
+  is_deleted: z.boolean().optional(),
+  is_forwarded: z.boolean().optional(),
+  forwarded_from_id: z.string().optional(),
+  forwarded_date: z.number().optional(),
+  // Hashtags and mentions (extracted for easier querying)
+  hashtags: z.array(z.string()).optional().default([]), // parsed output is [] when omitted
+  mentions: z.array(z.string()).optional().default([]), // parsed output is [] when omitted
+  // Platform-specific metadata (flexible)
+  metadata: z.union([ // NOTE(review): Zod returns the first option that validates, and z.object strips unknown keys by default, so option order matters — confirm
+    telegramMetadataSchema,
+    facebookMetadataSchema, // NOTE(review): listed before instagramMetadataSchema and both require only post_id, so Instagram metadata may validate here first and lose its Instagram-only keys — confirm
+    instagramMetadataSchema,
+    tiktokMetadataSchema,
+    websiteMetadataSchema,
+    z.record(z.string(), z.any()), // fallback for unknown platforms
+  ]).optional(),
+  processing_errors: z.string().optional(), // a single string, not a list
 });
+// Add validation refinement for platform-specific fields
+export const zodDataSchemaWithValidation = zodDataSchema.refine( // zodDataSchema plus one cross-field rule tying data_id to platform
+  (data) => {
+    // Ensure data_id exists for most platforms except websites
+    if (data.platform && data.platform !== 'website' && !data.data_id) { // rule is skipped when platform is unset; `!data.data_id` is a falsy check, so '' and 0 are rejected as well as undefined
+      return false;
+    }
+    return true;
+  },
+  {
+    message: "data_id is required for non-website platforms",
+    path: ["data_id"], // the validation issue is reported against the data_id field
+  }
+);
 export type RegionType = z.infer<typeof zodRegionSchema>;
 export type SourceGroupType = z.infer<typeof zodSourceGroupSchema>;
 export type SourceType = z.infer<typeof zodSourceSchema>;
 export type DataType = z.infer<typeof zodDataSchema>;
-export type SourceStatusType = (typeof sourceStatusList)[number];
-export type StatusType = (typeof generalStatusList)[number];
-export type TimeRangeTypeLiteral = 'relative' | 'absolute';
-export type AddSourceToReviewType = Pick<
-  SourceType,
-  'platform' | 'url' | 'description'
-> &
-  Partial<Pick<SourceType, 'public_id'>>;
-export type GeoType = z.infer<typeof zodGeoSchema>;
-export type GeoSelectionType = z.infer<typeof zodGeoSelectionSchema>;
+export type MediaItemType = z.infer<typeof mediaItemSchema>; // inferred type of one media attachment
+export type AuthorType = z.infer<typeof authorSchema>; // every property is optional, mirroring authorSchema
+export type RepliesInfoType = z.infer<typeof repliesInfoSchema>; // z.infer yields the parsed (output) type, so `count` is a required number here because of its default(0)
+// Platform-specific metadata types
+export type TelegramMetadataType = z.infer<typeof telegramMetadataSchema>;
+export type FacebookMetadataType = z.infer<typeof facebookMetadataSchema>;
+export type InstagramMetadataType = z.infer<typeof instagramMetadataSchema>;
+export type TiktokMetadataType = z.infer<typeof tiktokMetadataSchema>;
+export type WebsiteMetadataType = z.infer<typeof websiteMetadataSchema>;
 // Helper type to get allowed entities for a specific platform
 export type PlatformEntityType<T extends (typeof platformsList)[number]> =