npm - harvester_sdk - Versions diffs - 1.0.9 → 1.0.11 - Mend

harvester_sdk 1.0.9 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/types.js CHANGED Viewed

@@ -1,6 +1,6 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.getAllowedEntitiesForPlatform = exports.zodDataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
+exports.getAllowedEntitiesForPlatform = exports.zodDataSchemaWithValidation = exports.zodDataSchema = exports.authorSchema = exports.repliesInfoSchema = exports.mediaItemSchema = exports.websiteMetadataSchema = exports.tiktokMetadataSchema = exports.instagramMetadataSchema = exports.facebookMetadataSchema = exports.telegramMetadataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
 const zod_1 = require("zod");
 exports.platformsList = [
     'telegram',
@@ -105,46 +105,186 @@ exports.zodGeoSelectionSchema = zod_1.z.object({
     created_at: zod_1.z.number().optional(),
     updated_at: zod_1.z.number().optional(), // last update date
 });
+// Add specific metadata schemas for better type safety while keeping flexibility
+// Telegram metadata. `channel_id` is the only required string. `post_author`
+// is declared nullable but not optional, so the key must be supplied (a string
+// or null). Every other field is optional.
+exports.telegramMetadataSchema = zod_1.z.object({
+    channel_id: zod_1.z.string(),
+    views: zod_1.z.number().optional(),
+    forwards: zod_1.z.number().optional(),
+    reactions: zod_1.z.array(zod_1.z.any()).optional(), // element shape is not validated
+    hashtags: zod_1.z.array(zod_1.z.string()).optional(),
+    mentions: zod_1.z.array(zod_1.z.string()).optional(),
+    is_pinned: zod_1.z.boolean().optional(),
+    is_edited: zod_1.z.boolean().optional(),
+    edit_date: zod_1.z.number().optional(),
+    post_author: zod_1.z.string().nullable(), // nullable, NOT optional
+    via_bot_id: zod_1.z.string().optional(),
+    grouped_id: zod_1.z.string().optional(),
+    is_silent: zod_1.z.boolean().optional(),
+    forwarded_from_id: zod_1.z.string().optional(),
+    forwarded_date: zod_1.z.number().optional(),
+});
+// Facebook metadata. Only `post_id` is required; all counters and flags are
+// optional. NOTE(review): `post_id` is also the only required key of
+// instagramMetadataSchema, so the two shapes overlap when used in a union.
+exports.facebookMetadataSchema = zod_1.z.object({
+    post_id: zod_1.z.string(),
+    likes: zod_1.z.number().optional(),
+    shares: zod_1.z.number().optional(),
+    comments_count: zod_1.z.number().optional(),
+    reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(), // string key -> numeric value
+    is_pinned: zod_1.z.boolean().optional(),
+    is_edited: zod_1.z.boolean().optional(),
+    edit_date: zod_1.z.number().optional(),
+});
+// Instagram metadata. Only `post_id` is required; the remaining fields
+// (counters, hashtag/mention lists, location string, reel/story flags) are optional.
+exports.instagramMetadataSchema = zod_1.z.object({
+    post_id: zod_1.z.string(),
+    likes: zod_1.z.number().optional(),
+    comments_count: zod_1.z.number().optional(),
+    hashtags: zod_1.z.array(zod_1.z.string()).optional(),
+    mentions: zod_1.z.array(zod_1.z.string()).optional(),
+    location: zod_1.z.string().optional(),
+    is_reel: zod_1.z.boolean().optional(),
+    is_story: zod_1.z.boolean().optional(),
+});
+// TikTok metadata. Only `video_id` is required; every other field is optional.
+exports.tiktokMetadataSchema = zod_1.z.object({
+    video_id: zod_1.z.string(),
+    likes: zod_1.z.number().optional(),
+    shares: zod_1.z.number().optional(),
+    comments_count: zod_1.z.number().optional(),
+    views: zod_1.z.number().optional(),
+    hashtags: zod_1.z.array(zod_1.z.string()).optional(),
+    mentions: zod_1.z.array(zod_1.z.string()).optional(),
+    sound_name: zod_1.z.string().optional(),
+    sound_id: zod_1.z.string().optional(),
+    duration: zod_1.z.number().optional(), // NOTE(review): unit is not stated here - confirm
+});
+// Website/article metadata. `url` is the only required field and must pass
+// Zod's URL string check; the identifier, title, author, date, category and
+// tags are all optional.
+exports.websiteMetadataSchema = zod_1.z.object({
+    article_id: zod_1.z.string().optional(),
+    url: zod_1.z.string().url(),
+    title: zod_1.z.string().optional(),
+    author: zod_1.z.string().optional(),
+    publish_date: zod_1.z.number().optional(), // numeric timestamp; NOTE(review): unit not stated - confirm
+    category: zod_1.z.string().optional(),
+    tags: zod_1.z.array(zod_1.z.string()).optional(),
+});
+// Improved media schema with more types and optional dimensions
+// One media attachment. `type` (one of the seven listed kinds) and `url` are
+// required; everything else is optional. Unlike the inline media schema this
+// replaces, `url` is a plain string with no `.url()` format check.
+exports.mediaItemSchema = zod_1.z.object({
+    type: zod_1.z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']),
+    url: zod_1.z.string(),
+    caption: zod_1.z.string().optional(),
+    thumbnail_url: zod_1.z.string().optional(),
+    width: zod_1.z.number().optional(),
+    height: zod_1.z.number().optional(),
+    // The Mongoose schema in package/index.ts annotates `duration` as seconds
+    // and `size` as bytes; presumably the same units apply here - confirm.
+    duration: zod_1.z.number().optional(),
+    size: zod_1.z.number().optional(),
+    mime_type: zod_1.z.string().optional(),
+});
+// Improved replies schema
+// Structured reply summary. `count` is a number that defaults to 0 when it is
+// not supplied; the replier list and thread fields are optional.
+exports.repliesInfoSchema = zod_1.z.object({
+    count: zod_1.z.number().default(0),
+    recent_repliers: zod_1.z.array(zod_1.z.string()).optional(),
+    has_thread: zod_1.z.boolean().optional(),
+    thread_id: zod_1.z.string().optional(),
+});
+// Improved author schema for better consistency
+// Structured author record. Every field is optional, so an empty object is
+// accepted by this schema.
+exports.authorSchema = zod_1.z.object({
+    id: zod_1.z.string().optional(),
+    username: zod_1.z.string().optional(),
+    display_name: zod_1.z.string().optional(),
+    avatar_url: zod_1.z.string().optional(),
+    is_verified: zod_1.z.boolean().optional(),
+    follower_count: zod_1.z.number().optional(),
+});
 exports.zodDataSchema = zod_1.z.object({
     _id: zod_1.z.string().optional(),
+    // Timestamps
     timestamp: zod_1.z.number().optional(),
+    data_timestamp: zod_1.z.number().optional(),
+    created_at: zod_1.z.number(),
+    updated_at: zod_1.z.number(),
+    // Platform & Source Info (denormalized for query performance)
     platform: zod_1.z.enum(exports.platformsList).optional(),
-    source_region_id: zod_1.z.string().optional(),
-    source_region_title: zod_1.z.string().optional(),
-    source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional(),
     source_id: zod_1.z.string().or(zod_1.z.number()),
     source_title: zod_1.z.string().optional(),
     source_url: zod_1.z.string().url().optional(),
+    source_region_id: zod_1.z.string().optional(),
+    source_region_title: zod_1.z.string().optional(),
     source_group_id: zod_1.z.string().optional(),
+    source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional().default([]),
+    // Data/Post Identifiers
     data_id: zod_1.z.string().or(zod_1.z.number()).optional(),
-    data_geo: zod_1.z.array(zod_1.z.string()).optional(),
-    data_text: zod_1.z.string().optional(),
     data_url: zod_1.z.string().url().optional(),
-    data_original_type: zod_1.z.string().optional(),
+    data_original_type: zod_1.z.enum([
+        'post',
+        'comment',
+        'reply',
+        'video',
+        'image',
+        'photo',
+        'story',
+        'reel',
+        'article',
+        'link',
+        'document',
+    ]).optional(),
+    // Content
+    data_text: zod_1.z.string().optional(),
     data_language: zod_1.z.string().optional(),
-    data_sentiment: zod_1.z.string().optional(),
-    data_timestamp: zod_1.z.number().optional(),
-    is_reply: zod_1.z.boolean().optional(),
-    reply_to_message_id: zod_1.z.string().or(zod_1.z.number()).optional(),
-    metadata: zod_1.z.record(zod_1.z.string(), zod_1.z.any()).optional(),
-    created_at: zod_1.z.number(),
-    updated_at: zod_1.z.number(),
-    media: zod_1.z
-        .array(zod_1.z.object({
-        type: zod_1.z.enum(['image', 'video', 'audio', 'link']),
-        url: zod_1.z.string().url(),
-        caption: zod_1.z.string().optional(),
-    }))
-        .optional(),
+    data_geo: zod_1.z.array(zod_1.z.string()).optional().default([]),
+    // Analysis (populated by processors)
+    data_sentiment: zod_1.z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(),
+    data_topics: zod_1.z.array(zod_1.z.string()).optional(),
+    data_keywords: zod_1.z.array(zod_1.z.string()).optional(),
+    // Media attachments
+    media: zod_1.z.array(exports.mediaItemSchema).optional().default([]),
+    // Author information
     author: zod_1.z.string().optional(),
     author_username: zod_1.z.string().optional(),
+    author_id: zod_1.z.string().optional(),
+    author_info: exports.authorSchema.optional(),
+    // Reply/Thread information
+    is_reply: zod_1.z.boolean().optional().default(false),
+    reply_to_message_id: zod_1.z.string().or(zod_1.z.number()).optional(),
+    reply_to_author_id: zod_1.z.string().optional(),
     replies: zod_1.z.any().optional(),
-    author_id: zod_1.z.string().optional(), // e.g., author ID -> get from post
-    // translated_text: z.string().optional(), // translated text if available - most of the time it will be translated to English
-    // entities: z.any().optional(), // array of reply texts or IDs
-    // source_public_id: z.string().or(z.number()), // e.g., 'telegram:1234567890' (message_id)
-    // platform_id: z.string().or(z.number()), // channel_id reference to Source source_id
-    // original_text_id: z.string().or(z.number()), // message_id
+    replies_info: exports.repliesInfoSchema.optional(),
+    // Engagement metrics (platform-specific)
+    engagement: zod_1.z.object({
+        views: zod_1.z.number().optional(),
+        likes: zod_1.z.number().optional(),
+        shares: zod_1.z.number().optional(),
+        comments: zod_1.z.number().optional(),
+        reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(), // e.g., { "like": 10, "love": 5 }
+    }).optional(),
+    // Content flags
+    is_edited: zod_1.z.boolean().optional(),
+    edit_date: zod_1.z.number().optional(),
+    is_pinned: zod_1.z.boolean().optional(),
+    is_deleted: zod_1.z.boolean().optional(),
+    is_forwarded: zod_1.z.boolean().optional(),
+    forwarded_from_id: zod_1.z.string().optional(),
+    forwarded_date: zod_1.z.number().optional(),
+    // Hashtags and mentions (extracted for easier querying)
+    hashtags: zod_1.z.array(zod_1.z.string()).optional().default([]),
+    mentions: zod_1.z.array(zod_1.z.string()).optional().default([]),
+    // Platform-specific metadata (flexible)
+    metadata: zod_1.z.union([
+        exports.telegramMetadataSchema,
+        exports.facebookMetadataSchema,
+        exports.instagramMetadataSchema,
+        exports.tiktokMetadataSchema,
+        exports.websiteMetadataSchema,
+        zod_1.z.record(zod_1.z.string(), zod_1.z.any()), // fallback for unknown platforms
+    ]).optional(),
+    processing_errors: zod_1.z.string().optional(),
+});
+// Add validation refinement for platform-specific fields
+// Same shape as zodDataSchema, plus one cross-field rule: when `platform` is
+// set to anything other than 'website', `data_id` must be present. A failure
+// is reported on the `data_id` path.
+exports.zodDataSchemaWithValidation = exports.zodDataSchema.refine((data) => {
+    // Websites, and records with no platform set, may omit data_id.
+    if (!data.platform || data.platform === 'website') {
+        return true;
+    }
+    // Test for absence explicitly: data_id may be a number, and a bare
+    // truthiness check would reject a legitimate id of 0. The empty string
+    // is still treated as missing.
+    const id = data.data_id;
+    return id !== undefined && id !== null && id !== '';
+}, {
+    message: "data_id is required for non-website platforms",
+    path: ["data_id"],
 });
 // Helper function to get allowed entities for a platform
 const getAllowedEntitiesForPlatform = (platform) => {

package/index.ts CHANGED Viewed

@@ -11,44 +11,146 @@ import {
 export const MongoDataSchema = new Schema(
   {
-    timestamp: { type: Number }, // INDEX - date in milliseconds
+    // Timestamps
+    timestamp: { type: Number }, // INDEX - ingestion timestamp
+    data_timestamp: { type: Number }, // original post timestamp
+    created_at: { type: Number, default: Date.now, required: true },
+    updated_at: { type: Number, default: Date.now, required: true },
+    // Platform & Source Info (denormalized for query performance)
     platform: {
       type: String,
       enum: platformsList,
-    }, // e.g., 'telegram', 'facebook'
-    source_region_id: { type: String }, // e.g., 'hebron'
-    source_region_title: { type: String }, // e.g., 'hebron'
-    source_dominant_geos: { type: [String], default: [] }, // e.g., ['hebron', 'west bank']
+    },
     source_id: { type: Schema.Types.Mixed, required: true }, // INDEX - reference to Source _id
-    source_title: { type: String }, // e.g., 'Telegram Channel Name'
-    source_url: { type: String }, // e.g., 'https://t.me/telegram_channel_name'
-    source_group_id: { type: String }, // e.g., source group ID
-    data_id: { type: Schema.Types.Mixed }, // INDEX - original text ID (e.g., message_id)
-    data_geo: { type: [String], default: [] }, // INDEX - e.g., ["sinjil", "ramallah", "west bank"]
-    data_text: { type: String }, // processed text content
-    data_url: { type: String }, // original text URL if available
-    data_original_type: { type: String }, // e.g., 'post', 'comment', 'reply', 'video', 'image'
-    data_language: { type: String }, // detected language of the text
-    data_sentiment: { type: String }, // sentiment analysis result
-    data_timestamp: { type: Number }, // original post timestamp if different from ingestion timestamp
-    is_reply: { type: Boolean }, // true if this text is a reply to another text
-    reply_to_message_id: { type: Schema.Types.Mixed },
-    metadata: { type: Object }, // platform-specific fields
-    created_at: { type: Number, default: Date.now, required: true },
-    updated_at: { type: Number, default: Date.now, required: true },
+    source_title: { type: String },
+    source_url: { type: String },
+    source_region_id: { type: String }, // INDEX
+    source_region_title: { type: String },
+    source_group_id: { type: String }, // INDEX
+    source_dominant_geos: { type: [String], default: [] },
+    // Data/Post Identifiers
+    data_id: { type: Schema.Types.Mixed }, // INDEX - platform-specific post ID
+    data_url: { type: String }, // direct link to the post
+    data_original_type: {
+      type: String,
+      enum: [
+        'post',
+        'comment',
+        'reply',
+        'video',
+        'image',
+        'photo',
+        'story',
+        'reel',
+        'article',
+        'link',
+        'document',
+      ],
+    },
+    // Content
+    data_text: { type: String },
+    data_language: { type: String }, // ISO 639-1 code (e.g., 'en', 'ar')
+    data_geo: { type: [String], default: [] }, // INDEX - extracted locations
+    // Analysis (populated by processors)
+    data_sentiment: {
+      type: String,
+      enum: ['positive', 'negative', 'neutral', 'mixed'],
+    },
+    data_topics: { type: [String], default: [] }, // extracted topics/themes
+    data_keywords: { type: [String], default: [] }, // extracted keywords
+    // Media attachments
     media: {
       type: [
         {
-          type: { type: String, enum: ['image', 'video', 'audio', 'link'] },
+          type: {
+            type: String,
+            enum: [
+              'image',
+              'video',
+              'audio',
+              'link',
+              'document',
+              'gif',
+              'sticker',
+            ],
+          },
           url: { type: String },
           caption: { type: String },
+          thumbnail_url: { type: String },
+          width: { type: Number },
+          height: { type: Number },
+          duration: { type: Number }, // for video/audio in seconds
+          size: { type: Number }, // file size in bytes
+          mime_type: { type: String },
         },
       ],
-    }, // media attachments
-    author: { type: String }, // e.g., author name or ID
-    author_username: { type: String }, // e.g., author username
-    replies: { type: Schema.Types.Mixed }, // array of reply texts or IDs
-    author_id: { type: String }, // e.g., author ID
+      default: [],
+    },
+    // Author information (legacy fields for backward compatibility)
+    author: { type: String }, // legacy - display name
+    author_username: { type: String }, // legacy
+    author_id: { type: String }, // legacy
+    // NEW - structured author info
+    author_info: {
+      type: {
+        id: { type: String },
+        username: { type: String },
+        display_name: { type: String },
+        avatar_url: { type: String },
+        is_verified: { type: Boolean },
+        follower_count: { type: Number },
+      },
+    },
+    // Reply/Thread information
+    is_reply: { type: Boolean, default: false },
+    reply_to_message_id: { type: Schema.Types.Mixed },
+    reply_to_author_id: { type: String },
+    replies: { type: Schema.Types.Mixed }, // legacy field - keep for backward compatibility
+    // NEW - structured replies info
+    replies_info: {
+      type: {
+        count: { type: Number, default: 0 },
+        recent_repliers: { type: [String] },
+        has_thread: { type: Boolean },
+        thread_id: { type: String },
+      },
+    },
+    // Engagement metrics (platform-specific)
+    engagement: {
+      type: {
+        views: { type: Number },
+        likes: { type: Number },
+        shares: { type: Number },
+        comments: { type: Number },
+        reactions: { type: Map, of: Number }, // e.g., { "like": 10, "love": 5 }
+      },
+    },
+    // Content flags
+    is_edited: { type: Boolean },
+    edit_date: { type: Number },
+    is_pinned: { type: Boolean },
+    is_deleted: { type: Boolean },
+    is_forwarded: { type: Boolean },
+    forwarded_from_id: { type: String },
+    forwarded_date: { type: Number },
+    // Hashtags and mentions (extracted for easier querying)
+    hashtags: { type: [String], default: [] },
+    mentions: { type: [String], default: [] },
+    // Platform-specific metadata (flexible)
+    metadata: { type: Object },
+    processing_errors: { type: String },
   },
   {
     versionKey: false,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "harvester_sdk",
-  "version": "1.0.9",
+  "version": "1.0.11",
   "description": "SDK for interacting with the Harvester API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",