harvester_sdk 1.0.9 → 1.0.11

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their public registry.
package/dist/types.js CHANGED
@@ -1,6 +1,6 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.getAllowedEntitiesForPlatform = exports.zodDataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
3
+ exports.getAllowedEntitiesForPlatform = exports.zodDataSchemaWithValidation = exports.zodDataSchema = exports.authorSchema = exports.repliesInfoSchema = exports.mediaItemSchema = exports.websiteMetadataSchema = exports.tiktokMetadataSchema = exports.instagramMetadataSchema = exports.facebookMetadataSchema = exports.telegramMetadataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
4
4
  const zod_1 = require("zod");
5
5
  exports.platformsList = [
6
6
  'telegram',
@@ -105,46 +105,186 @@ exports.zodGeoSelectionSchema = zod_1.z.object({
105
105
  created_at: zod_1.z.number().optional(),
106
106
  updated_at: zod_1.z.number().optional(), // last update date
107
107
  });
108
+ // Add specific metadata schemas for better type safety while keeping flexibility
109
+ exports.telegramMetadataSchema = zod_1.z.object({
110
+ channel_id: zod_1.z.string(),
111
+ views: zod_1.z.number().optional(),
112
+ forwards: zod_1.z.number().optional(),
113
+ reactions: zod_1.z.array(zod_1.z.any()).optional(),
114
+ hashtags: zod_1.z.array(zod_1.z.string()).optional(),
115
+ mentions: zod_1.z.array(zod_1.z.string()).optional(),
116
+ is_pinned: zod_1.z.boolean().optional(),
117
+ is_edited: zod_1.z.boolean().optional(),
118
+ edit_date: zod_1.z.number().optional(),
119
+ post_author: zod_1.z.string().nullable(),
120
+ via_bot_id: zod_1.z.string().optional(),
121
+ grouped_id: zod_1.z.string().optional(),
122
+ is_silent: zod_1.z.boolean().optional(),
123
+ forwarded_from_id: zod_1.z.string().optional(),
124
+ forwarded_date: zod_1.z.number().optional(),
125
+ });
126
+ exports.facebookMetadataSchema = zod_1.z.object({
127
+ post_id: zod_1.z.string(),
128
+ likes: zod_1.z.number().optional(),
129
+ shares: zod_1.z.number().optional(),
130
+ comments_count: zod_1.z.number().optional(),
131
+ reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(),
132
+ is_pinned: zod_1.z.boolean().optional(),
133
+ is_edited: zod_1.z.boolean().optional(),
134
+ edit_date: zod_1.z.number().optional(),
135
+ });
136
+ exports.instagramMetadataSchema = zod_1.z.object({
137
+ post_id: zod_1.z.string(),
138
+ likes: zod_1.z.number().optional(),
139
+ comments_count: zod_1.z.number().optional(),
140
+ hashtags: zod_1.z.array(zod_1.z.string()).optional(),
141
+ mentions: zod_1.z.array(zod_1.z.string()).optional(),
142
+ location: zod_1.z.string().optional(),
143
+ is_reel: zod_1.z.boolean().optional(),
144
+ is_story: zod_1.z.boolean().optional(),
145
+ });
146
+ exports.tiktokMetadataSchema = zod_1.z.object({
147
+ video_id: zod_1.z.string(),
148
+ likes: zod_1.z.number().optional(),
149
+ shares: zod_1.z.number().optional(),
150
+ comments_count: zod_1.z.number().optional(),
151
+ views: zod_1.z.number().optional(),
152
+ hashtags: zod_1.z.array(zod_1.z.string()).optional(),
153
+ mentions: zod_1.z.array(zod_1.z.string()).optional(),
154
+ sound_name: zod_1.z.string().optional(),
155
+ sound_id: zod_1.z.string().optional(),
156
+ duration: zod_1.z.number().optional(),
157
+ });
158
+ exports.websiteMetadataSchema = zod_1.z.object({
159
+ article_id: zod_1.z.string().optional(),
160
+ url: zod_1.z.string().url(),
161
+ title: zod_1.z.string().optional(),
162
+ author: zod_1.z.string().optional(),
163
+ publish_date: zod_1.z.number().optional(),
164
+ category: zod_1.z.string().optional(),
165
+ tags: zod_1.z.array(zod_1.z.string()).optional(),
166
+ });
167
+ // Improved media schema with more types and optional dimensions
168
+ exports.mediaItemSchema = zod_1.z.object({
169
+ type: zod_1.z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']),
170
+ url: zod_1.z.string(),
171
+ caption: zod_1.z.string().optional(),
172
+ thumbnail_url: zod_1.z.string().optional(),
173
+ width: zod_1.z.number().optional(),
174
+ height: zod_1.z.number().optional(),
175
+ duration: zod_1.z.number().optional(),
176
+ size: zod_1.z.number().optional(),
177
+ mime_type: zod_1.z.string().optional(),
178
+ });
179
+ // Improved replies schema
180
+ exports.repliesInfoSchema = zod_1.z.object({
181
+ count: zod_1.z.number().default(0),
182
+ recent_repliers: zod_1.z.array(zod_1.z.string()).optional(),
183
+ has_thread: zod_1.z.boolean().optional(),
184
+ thread_id: zod_1.z.string().optional(),
185
+ });
186
+ // Improved author schema for better consistency
187
+ exports.authorSchema = zod_1.z.object({
188
+ id: zod_1.z.string().optional(),
189
+ username: zod_1.z.string().optional(),
190
+ display_name: zod_1.z.string().optional(),
191
+ avatar_url: zod_1.z.string().optional(),
192
+ is_verified: zod_1.z.boolean().optional(),
193
+ follower_count: zod_1.z.number().optional(),
194
+ });
108
195
  exports.zodDataSchema = zod_1.z.object({
109
196
  _id: zod_1.z.string().optional(),
197
+ // Timestamps
110
198
  timestamp: zod_1.z.number().optional(),
199
+ data_timestamp: zod_1.z.number().optional(),
200
+ created_at: zod_1.z.number(),
201
+ updated_at: zod_1.z.number(),
202
+ // Platform & Source Info (denormalized for query performance)
111
203
  platform: zod_1.z.enum(exports.platformsList).optional(),
112
- source_region_id: zod_1.z.string().optional(),
113
- source_region_title: zod_1.z.string().optional(),
114
- source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional(),
115
204
  source_id: zod_1.z.string().or(zod_1.z.number()),
116
205
  source_title: zod_1.z.string().optional(),
117
206
  source_url: zod_1.z.string().url().optional(),
207
+ source_region_id: zod_1.z.string().optional(),
208
+ source_region_title: zod_1.z.string().optional(),
118
209
  source_group_id: zod_1.z.string().optional(),
210
+ source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional().default([]),
211
+ // Data/Post Identifiers
119
212
  data_id: zod_1.z.string().or(zod_1.z.number()).optional(),
120
- data_geo: zod_1.z.array(zod_1.z.string()).optional(),
121
- data_text: zod_1.z.string().optional(),
122
213
  data_url: zod_1.z.string().url().optional(),
123
- data_original_type: zod_1.z.string().optional(),
214
+ data_original_type: zod_1.z.enum([
215
+ 'post',
216
+ 'comment',
217
+ 'reply',
218
+ 'video',
219
+ 'image',
220
+ 'photo',
221
+ 'story',
222
+ 'reel',
223
+ 'article',
224
+ 'link',
225
+ 'document',
226
+ ]).optional(),
227
+ // Content
228
+ data_text: zod_1.z.string().optional(),
124
229
  data_language: zod_1.z.string().optional(),
125
- data_sentiment: zod_1.z.string().optional(),
126
- data_timestamp: zod_1.z.number().optional(),
127
- is_reply: zod_1.z.boolean().optional(),
128
- reply_to_message_id: zod_1.z.string().or(zod_1.z.number()).optional(),
129
- metadata: zod_1.z.record(zod_1.z.string(), zod_1.z.any()).optional(),
130
- created_at: zod_1.z.number(),
131
- updated_at: zod_1.z.number(),
132
- media: zod_1.z
133
- .array(zod_1.z.object({
134
- type: zod_1.z.enum(['image', 'video', 'audio', 'link']),
135
- url: zod_1.z.string().url(),
136
- caption: zod_1.z.string().optional(),
137
- }))
138
- .optional(),
230
+ data_geo: zod_1.z.array(zod_1.z.string()).optional().default([]),
231
+ // Analysis (populated by processors)
232
+ data_sentiment: zod_1.z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(),
233
+ data_topics: zod_1.z.array(zod_1.z.string()).optional(),
234
+ data_keywords: zod_1.z.array(zod_1.z.string()).optional(),
235
+ // Media attachments
236
+ media: zod_1.z.array(exports.mediaItemSchema).optional().default([]),
237
+ // Author information
139
238
  author: zod_1.z.string().optional(),
140
239
  author_username: zod_1.z.string().optional(),
240
+ author_id: zod_1.z.string().optional(),
241
+ author_info: exports.authorSchema.optional(),
242
+ // Reply/Thread information
243
+ is_reply: zod_1.z.boolean().optional().default(false),
244
+ reply_to_message_id: zod_1.z.string().or(zod_1.z.number()).optional(),
245
+ reply_to_author_id: zod_1.z.string().optional(),
141
246
  replies: zod_1.z.any().optional(),
142
- author_id: zod_1.z.string().optional(), // e.g., author ID -> get from post
143
- // translated_text: z.string().optional(), // translated text if available - most of the time it will be translated to English
144
- // entities: z.any().optional(), // array of reply texts or IDs
145
- // source_public_id: z.string().or(z.number()), // e.g., 'telegram:1234567890' (message_id)
146
- // platform_id: z.string().or(z.number()), // channel_id reference to Source source_id
147
- // original_text_id: z.string().or(z.number()), // message_id
247
+ replies_info: exports.repliesInfoSchema.optional(),
248
+ // Engagement metrics (platform-specific)
249
+ engagement: zod_1.z.object({
250
+ views: zod_1.z.number().optional(),
251
+ likes: zod_1.z.number().optional(),
252
+ shares: zod_1.z.number().optional(),
253
+ comments: zod_1.z.number().optional(),
254
+ reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(), // e.g., { "like": 10, "love": 5 }
255
+ }).optional(),
256
+ // Content flags
257
+ is_edited: zod_1.z.boolean().optional(),
258
+ edit_date: zod_1.z.number().optional(),
259
+ is_pinned: zod_1.z.boolean().optional(),
260
+ is_deleted: zod_1.z.boolean().optional(),
261
+ is_forwarded: zod_1.z.boolean().optional(),
262
+ forwarded_from_id: zod_1.z.string().optional(),
263
+ forwarded_date: zod_1.z.number().optional(),
264
+ // Hashtags and mentions (extracted for easier querying)
265
+ hashtags: zod_1.z.array(zod_1.z.string()).optional().default([]),
266
+ mentions: zod_1.z.array(zod_1.z.string()).optional().default([]),
267
+ // Platform-specific metadata (flexible)
268
+ metadata: zod_1.z.union([
269
+ exports.telegramMetadataSchema,
270
+ exports.facebookMetadataSchema,
271
+ exports.instagramMetadataSchema,
272
+ exports.tiktokMetadataSchema,
273
+ exports.websiteMetadataSchema,
274
+ zod_1.z.record(zod_1.z.string(), zod_1.z.any()), // fallback for unknown platforms
275
+ ]).optional(),
276
+ processing_errors: zod_1.z.string().optional(),
277
+ });
278
+ // Add validation refinement for platform-specific fields
279
+ exports.zodDataSchemaWithValidation = exports.zodDataSchema.refine((data) => {
280
+ // Ensure data_id exists for most platforms except websites
281
+ if (data.platform && data.platform !== 'website' && !data.data_id) {
282
+ return false;
283
+ }
284
+ return true;
285
+ }, {
286
+ message: "data_id is required for non-website platforms",
287
+ path: ["data_id"],
148
288
  });
149
289
  // Helper function to get allowed entities for a platform
150
290
  const getAllowedEntitiesForPlatform = (platform) => {
package/index.ts CHANGED
@@ -11,44 +11,146 @@ import {
11
11
 
12
12
  export const MongoDataSchema = new Schema(
13
13
  {
14
- timestamp: { type: Number }, // INDEX - date in milliseconds
14
+ // Timestamps
15
+ timestamp: { type: Number }, // INDEX - ingestion timestamp
16
+ data_timestamp: { type: Number }, // original post timestamp
17
+ created_at: { type: Number, default: Date.now, required: true },
18
+ updated_at: { type: Number, default: Date.now, required: true },
19
+
20
+ // Platform & Source Info (denormalized for query performance)
15
21
  platform: {
16
22
  type: String,
17
23
  enum: platformsList,
18
- }, // e.g., 'telegram', 'facebook'
19
- source_region_id: { type: String }, // e.g., 'hebron'
20
- source_region_title: { type: String }, // e.g., 'hebron'
21
- source_dominant_geos: { type: [String], default: [] }, // e.g., ['hebron', 'west bank']
24
+ },
22
25
  source_id: { type: Schema.Types.Mixed, required: true }, // INDEX - reference to Source _id
23
- source_title: { type: String }, // e.g., 'Telegram Channel Name'
24
- source_url: { type: String }, // e.g., 'https://t.me/telegram_channel_name'
25
- source_group_id: { type: String }, // e.g., source group ID
26
- data_id: { type: Schema.Types.Mixed }, // INDEX - original text ID (e.g., message_id)
27
- data_geo: { type: [String], default: [] }, // INDEX - e.g., ["sinjil", "ramallah", "west bank"]
28
- data_text: { type: String }, // processed text content
29
- data_url: { type: String }, // original text URL if available
30
- data_original_type: { type: String }, // e.g., 'post', 'comment', 'reply', 'video', 'image'
31
- data_language: { type: String }, // detected language of the text
32
- data_sentiment: { type: String }, // sentiment analysis result
33
- data_timestamp: { type: Number }, // original post timestamp if different from ingestion timestamp
34
- is_reply: { type: Boolean }, // true if this text is a reply to another text
35
- reply_to_message_id: { type: Schema.Types.Mixed },
36
- metadata: { type: Object }, // platform-specific fields
37
- created_at: { type: Number, default: Date.now, required: true },
38
- updated_at: { type: Number, default: Date.now, required: true },
26
+ source_title: { type: String },
27
+ source_url: { type: String },
28
+ source_region_id: { type: String }, // INDEX
29
+ source_region_title: { type: String },
30
+ source_group_id: { type: String }, // INDEX
31
+ source_dominant_geos: { type: [String], default: [] },
32
+
33
+ // Data/Post Identifiers
34
+ data_id: { type: Schema.Types.Mixed }, // INDEX - platform-specific post ID
35
+ data_url: { type: String }, // direct link to the post
36
+ data_original_type: {
37
+ type: String,
38
+ enum: [
39
+ 'post',
40
+ 'comment',
41
+ 'reply',
42
+ 'video',
43
+ 'image',
44
+ 'photo',
45
+ 'story',
46
+ 'reel',
47
+ 'article',
48
+ 'link',
49
+ 'document',
50
+ ],
51
+ },
52
+
53
+ // Content
54
+ data_text: { type: String },
55
+ data_language: { type: String }, // ISO 639-1 code (e.g., 'en', 'ar')
56
+ data_geo: { type: [String], default: [] }, // INDEX - extracted locations
57
+
58
+ // Analysis (populated by processors)
59
+ data_sentiment: {
60
+ type: String,
61
+ enum: ['positive', 'negative', 'neutral', 'mixed'],
62
+ },
63
+ data_topics: { type: [String], default: [] }, // extracted topics/themes
64
+ data_keywords: { type: [String], default: [] }, // extracted keywords
65
+
66
+ // Media attachments
39
67
  media: {
40
68
  type: [
41
69
  {
42
- type: { type: String, enum: ['image', 'video', 'audio', 'link'] },
70
+ type: {
71
+ type: String,
72
+ enum: [
73
+ 'image',
74
+ 'video',
75
+ 'audio',
76
+ 'link',
77
+ 'document',
78
+ 'gif',
79
+ 'sticker',
80
+ ],
81
+ },
43
82
  url: { type: String },
44
83
  caption: { type: String },
84
+ thumbnail_url: { type: String },
85
+ width: { type: Number },
86
+ height: { type: Number },
87
+ duration: { type: Number }, // for video/audio in seconds
88
+ size: { type: Number }, // file size in bytes
89
+ mime_type: { type: String },
45
90
  },
46
91
  ],
47
- }, // media attachments
48
- author: { type: String }, // e.g., author name or ID
49
- author_username: { type: String }, // e.g., author username
50
- replies: { type: Schema.Types.Mixed }, // array of reply texts or IDs
51
- author_id: { type: String }, // e.g., author ID
92
+ default: [],
93
+ },
94
+
95
+ // Author information (legacy fields for backward compatibility)
96
+ author: { type: String }, // legacy - display name
97
+ author_username: { type: String }, // legacy
98
+ author_id: { type: String }, // legacy
99
+ // NEW - structured author info
100
+ author_info: {
101
+ type: {
102
+ id: { type: String },
103
+ username: { type: String },
104
+ display_name: { type: String },
105
+ avatar_url: { type: String },
106
+ is_verified: { type: Boolean },
107
+ follower_count: { type: Number },
108
+ },
109
+ },
110
+
111
+ // Reply/Thread information
112
+ is_reply: { type: Boolean, default: false },
113
+ reply_to_message_id: { type: Schema.Types.Mixed },
114
+ reply_to_author_id: { type: String },
115
+ replies: { type: Schema.Types.Mixed }, // legacy field - keep for backward compatibility
116
+ // NEW - structured replies info
117
+ replies_info: {
118
+ type: {
119
+ count: { type: Number, default: 0 },
120
+ recent_repliers: { type: [String] },
121
+ has_thread: { type: Boolean },
122
+ thread_id: { type: String },
123
+ },
124
+ },
125
+
126
+ // Engagement metrics (platform-specific)
127
+ engagement: {
128
+ type: {
129
+ views: { type: Number },
130
+ likes: { type: Number },
131
+ shares: { type: Number },
132
+ comments: { type: Number },
133
+ reactions: { type: Map, of: Number }, // e.g., { "like": 10, "love": 5 }
134
+ },
135
+ },
136
+
137
+ // Content flags
138
+ is_edited: { type: Boolean },
139
+ edit_date: { type: Number },
140
+ is_pinned: { type: Boolean },
141
+ is_deleted: { type: Boolean },
142
+ is_forwarded: { type: Boolean },
143
+ forwarded_from_id: { type: String },
144
+ forwarded_date: { type: Number },
145
+
146
+ // Hashtags and mentions (extracted for easier querying)
147
+ hashtags: { type: [String], default: [] },
148
+ mentions: { type: [String], default: [] },
149
+
150
+ // Platform-specific metadata (flexible)
151
+ metadata: { type: Object },
152
+
153
+ processing_errors: { type: String },
52
154
  },
53
155
  {
54
156
  versionKey: false,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harvester_sdk",
3
- "version": "1.0.9",
3
+ "version": "1.0.11",
4
4
  "description": "SDK for interacting with the Harvester API",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",