harvester_sdk 1.0.9 → 1.0.11
This diff shows the changes between publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/index.d.ts +198 -42
- package/dist/index.js +105 -15
- package/dist/types.d.ts +1507 -44
- package/dist/types.js +167 -27
- package/index.ts +129 -27
- package/package.json +1 -1
- package/types.ts +212 -47
package/dist/types.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.getAllowedEntitiesForPlatform = exports.zodDataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
|
|
3
|
+
exports.getAllowedEntitiesForPlatform = exports.zodDataSchemaWithValidation = exports.zodDataSchema = exports.authorSchema = exports.repliesInfoSchema = exports.mediaItemSchema = exports.websiteMetadataSchema = exports.tiktokMetadataSchema = exports.instagramMetadataSchema = exports.facebookMetadataSchema = exports.telegramMetadataSchema = exports.zodGeoSelectionSchema = exports.zodGeoSchema = exports.zodSourceSchema = exports.zodSourceGroupSchema = exports.zodRegionSchema = exports.generalStatusList = exports.sourceStatusList = exports.platformEntityMap = exports.entityTypesList = exports.platformsList = void 0;
|
|
4
4
|
const zod_1 = require("zod");
|
|
5
5
|
exports.platformsList = [
|
|
6
6
|
'telegram',
|
|
@@ -105,46 +105,186 @@ exports.zodGeoSelectionSchema = zod_1.z.object({
|
|
|
105
105
|
created_at: zod_1.z.number().optional(),
|
|
106
106
|
updated_at: zod_1.z.number().optional(), // last update date
|
|
107
107
|
});
|
|
108
|
+
// Add specific metadata schemas for better type safety while keeping flexibility
|
|
109
|
+
exports.telegramMetadataSchema = zod_1.z.object({
|
|
110
|
+
channel_id: zod_1.z.string(),
|
|
111
|
+
views: zod_1.z.number().optional(),
|
|
112
|
+
forwards: zod_1.z.number().optional(),
|
|
113
|
+
reactions: zod_1.z.array(zod_1.z.any()).optional(),
|
|
114
|
+
hashtags: zod_1.z.array(zod_1.z.string()).optional(),
|
|
115
|
+
mentions: zod_1.z.array(zod_1.z.string()).optional(),
|
|
116
|
+
is_pinned: zod_1.z.boolean().optional(),
|
|
117
|
+
is_edited: zod_1.z.boolean().optional(),
|
|
118
|
+
edit_date: zod_1.z.number().optional(),
|
|
119
|
+
post_author: zod_1.z.string().nullable(),
|
|
120
|
+
via_bot_id: zod_1.z.string().optional(),
|
|
121
|
+
grouped_id: zod_1.z.string().optional(),
|
|
122
|
+
is_silent: zod_1.z.boolean().optional(),
|
|
123
|
+
forwarded_from_id: zod_1.z.string().optional(),
|
|
124
|
+
forwarded_date: zod_1.z.number().optional(),
|
|
125
|
+
});
|
|
126
|
+
exports.facebookMetadataSchema = zod_1.z.object({
|
|
127
|
+
post_id: zod_1.z.string(),
|
|
128
|
+
likes: zod_1.z.number().optional(),
|
|
129
|
+
shares: zod_1.z.number().optional(),
|
|
130
|
+
comments_count: zod_1.z.number().optional(),
|
|
131
|
+
reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(),
|
|
132
|
+
is_pinned: zod_1.z.boolean().optional(),
|
|
133
|
+
is_edited: zod_1.z.boolean().optional(),
|
|
134
|
+
edit_date: zod_1.z.number().optional(),
|
|
135
|
+
});
|
|
136
|
+
exports.instagramMetadataSchema = zod_1.z.object({
|
|
137
|
+
post_id: zod_1.z.string(),
|
|
138
|
+
likes: zod_1.z.number().optional(),
|
|
139
|
+
comments_count: zod_1.z.number().optional(),
|
|
140
|
+
hashtags: zod_1.z.array(zod_1.z.string()).optional(),
|
|
141
|
+
mentions: zod_1.z.array(zod_1.z.string()).optional(),
|
|
142
|
+
location: zod_1.z.string().optional(),
|
|
143
|
+
is_reel: zod_1.z.boolean().optional(),
|
|
144
|
+
is_story: zod_1.z.boolean().optional(),
|
|
145
|
+
});
|
|
146
|
+
exports.tiktokMetadataSchema = zod_1.z.object({
|
|
147
|
+
video_id: zod_1.z.string(),
|
|
148
|
+
likes: zod_1.z.number().optional(),
|
|
149
|
+
shares: zod_1.z.number().optional(),
|
|
150
|
+
comments_count: zod_1.z.number().optional(),
|
|
151
|
+
views: zod_1.z.number().optional(),
|
|
152
|
+
hashtags: zod_1.z.array(zod_1.z.string()).optional(),
|
|
153
|
+
mentions: zod_1.z.array(zod_1.z.string()).optional(),
|
|
154
|
+
sound_name: zod_1.z.string().optional(),
|
|
155
|
+
sound_id: zod_1.z.string().optional(),
|
|
156
|
+
duration: zod_1.z.number().optional(),
|
|
157
|
+
});
|
|
158
|
+
exports.websiteMetadataSchema = zod_1.z.object({
|
|
159
|
+
article_id: zod_1.z.string().optional(),
|
|
160
|
+
url: zod_1.z.string().url(),
|
|
161
|
+
title: zod_1.z.string().optional(),
|
|
162
|
+
author: zod_1.z.string().optional(),
|
|
163
|
+
publish_date: zod_1.z.number().optional(),
|
|
164
|
+
category: zod_1.z.string().optional(),
|
|
165
|
+
tags: zod_1.z.array(zod_1.z.string()).optional(),
|
|
166
|
+
});
|
|
167
|
+
// Improved media schema with more types and optional dimensions
|
|
168
|
+
exports.mediaItemSchema = zod_1.z.object({
|
|
169
|
+
type: zod_1.z.enum(['image', 'video', 'audio', 'link', 'document', 'gif', 'sticker']),
|
|
170
|
+
url: zod_1.z.string(),
|
|
171
|
+
caption: zod_1.z.string().optional(),
|
|
172
|
+
thumbnail_url: zod_1.z.string().optional(),
|
|
173
|
+
width: zod_1.z.number().optional(),
|
|
174
|
+
height: zod_1.z.number().optional(),
|
|
175
|
+
duration: zod_1.z.number().optional(),
|
|
176
|
+
size: zod_1.z.number().optional(),
|
|
177
|
+
mime_type: zod_1.z.string().optional(),
|
|
178
|
+
});
|
|
179
|
+
// Improved replies schema
|
|
180
|
+
exports.repliesInfoSchema = zod_1.z.object({
|
|
181
|
+
count: zod_1.z.number().default(0),
|
|
182
|
+
recent_repliers: zod_1.z.array(zod_1.z.string()).optional(),
|
|
183
|
+
has_thread: zod_1.z.boolean().optional(),
|
|
184
|
+
thread_id: zod_1.z.string().optional(),
|
|
185
|
+
});
|
|
186
|
+
// Improved author schema for better consistency
|
|
187
|
+
exports.authorSchema = zod_1.z.object({
|
|
188
|
+
id: zod_1.z.string().optional(),
|
|
189
|
+
username: zod_1.z.string().optional(),
|
|
190
|
+
display_name: zod_1.z.string().optional(),
|
|
191
|
+
avatar_url: zod_1.z.string().optional(),
|
|
192
|
+
is_verified: zod_1.z.boolean().optional(),
|
|
193
|
+
follower_count: zod_1.z.number().optional(),
|
|
194
|
+
});
|
|
108
195
|
exports.zodDataSchema = zod_1.z.object({
|
|
109
196
|
_id: zod_1.z.string().optional(),
|
|
197
|
+
// Timestamps
|
|
110
198
|
timestamp: zod_1.z.number().optional(),
|
|
199
|
+
data_timestamp: zod_1.z.number().optional(),
|
|
200
|
+
created_at: zod_1.z.number(),
|
|
201
|
+
updated_at: zod_1.z.number(),
|
|
202
|
+
// Platform & Source Info (denormalized for query performance)
|
|
111
203
|
platform: zod_1.z.enum(exports.platformsList).optional(),
|
|
112
|
-
source_region_id: zod_1.z.string().optional(),
|
|
113
|
-
source_region_title: zod_1.z.string().optional(),
|
|
114
|
-
source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional(),
|
|
115
204
|
source_id: zod_1.z.string().or(zod_1.z.number()),
|
|
116
205
|
source_title: zod_1.z.string().optional(),
|
|
117
206
|
source_url: zod_1.z.string().url().optional(),
|
|
207
|
+
source_region_id: zod_1.z.string().optional(),
|
|
208
|
+
source_region_title: zod_1.z.string().optional(),
|
|
118
209
|
source_group_id: zod_1.z.string().optional(),
|
|
210
|
+
source_dominant_geos: zod_1.z.array(zod_1.z.string()).optional().default([]),
|
|
211
|
+
// Data/Post Identifiers
|
|
119
212
|
data_id: zod_1.z.string().or(zod_1.z.number()).optional(),
|
|
120
|
-
data_geo: zod_1.z.array(zod_1.z.string()).optional(),
|
|
121
|
-
data_text: zod_1.z.string().optional(),
|
|
122
213
|
data_url: zod_1.z.string().url().optional(),
|
|
123
|
-
data_original_type: zod_1.z.
|
|
214
|
+
data_original_type: zod_1.z.enum([
|
|
215
|
+
'post',
|
|
216
|
+
'comment',
|
|
217
|
+
'reply',
|
|
218
|
+
'video',
|
|
219
|
+
'image',
|
|
220
|
+
'photo',
|
|
221
|
+
'story',
|
|
222
|
+
'reel',
|
|
223
|
+
'article',
|
|
224
|
+
'link',
|
|
225
|
+
'document',
|
|
226
|
+
]).optional(),
|
|
227
|
+
// Content
|
|
228
|
+
data_text: zod_1.z.string().optional(),
|
|
124
229
|
data_language: zod_1.z.string().optional(),
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
.array(zod_1.z.object({
|
|
134
|
-
type: zod_1.z.enum(['image', 'video', 'audio', 'link']),
|
|
135
|
-
url: zod_1.z.string().url(),
|
|
136
|
-
caption: zod_1.z.string().optional(),
|
|
137
|
-
}))
|
|
138
|
-
.optional(),
|
|
230
|
+
data_geo: zod_1.z.array(zod_1.z.string()).optional().default([]),
|
|
231
|
+
// Analysis (populated by processors)
|
|
232
|
+
data_sentiment: zod_1.z.enum(['positive', 'negative', 'neutral', 'mixed']).optional(),
|
|
233
|
+
data_topics: zod_1.z.array(zod_1.z.string()).optional(),
|
|
234
|
+
data_keywords: zod_1.z.array(zod_1.z.string()).optional(),
|
|
235
|
+
// Media attachments
|
|
236
|
+
media: zod_1.z.array(exports.mediaItemSchema).optional().default([]),
|
|
237
|
+
// Author information
|
|
139
238
|
author: zod_1.z.string().optional(),
|
|
140
239
|
author_username: zod_1.z.string().optional(),
|
|
240
|
+
author_id: zod_1.z.string().optional(),
|
|
241
|
+
author_info: exports.authorSchema.optional(),
|
|
242
|
+
// Reply/Thread information
|
|
243
|
+
is_reply: zod_1.z.boolean().optional().default(false),
|
|
244
|
+
reply_to_message_id: zod_1.z.string().or(zod_1.z.number()).optional(),
|
|
245
|
+
reply_to_author_id: zod_1.z.string().optional(),
|
|
141
246
|
replies: zod_1.z.any().optional(),
|
|
142
|
-
|
|
143
|
-
//
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
247
|
+
replies_info: exports.repliesInfoSchema.optional(),
|
|
248
|
+
// Engagement metrics (platform-specific)
|
|
249
|
+
engagement: zod_1.z.object({
|
|
250
|
+
views: zod_1.z.number().optional(),
|
|
251
|
+
likes: zod_1.z.number().optional(),
|
|
252
|
+
shares: zod_1.z.number().optional(),
|
|
253
|
+
comments: zod_1.z.number().optional(),
|
|
254
|
+
reactions: zod_1.z.record(zod_1.z.string(), zod_1.z.number()).optional(), // e.g., { "like": 10, "love": 5 }
|
|
255
|
+
}).optional(),
|
|
256
|
+
// Content flags
|
|
257
|
+
is_edited: zod_1.z.boolean().optional(),
|
|
258
|
+
edit_date: zod_1.z.number().optional(),
|
|
259
|
+
is_pinned: zod_1.z.boolean().optional(),
|
|
260
|
+
is_deleted: zod_1.z.boolean().optional(),
|
|
261
|
+
is_forwarded: zod_1.z.boolean().optional(),
|
|
262
|
+
forwarded_from_id: zod_1.z.string().optional(),
|
|
263
|
+
forwarded_date: zod_1.z.number().optional(),
|
|
264
|
+
// Hashtags and mentions (extracted for easier querying)
|
|
265
|
+
hashtags: zod_1.z.array(zod_1.z.string()).optional().default([]),
|
|
266
|
+
mentions: zod_1.z.array(zod_1.z.string()).optional().default([]),
|
|
267
|
+
// Platform-specific metadata (flexible)
|
|
268
|
+
metadata: zod_1.z.union([
|
|
269
|
+
exports.telegramMetadataSchema,
|
|
270
|
+
exports.facebookMetadataSchema,
|
|
271
|
+
exports.instagramMetadataSchema,
|
|
272
|
+
exports.tiktokMetadataSchema,
|
|
273
|
+
exports.websiteMetadataSchema,
|
|
274
|
+
zod_1.z.record(zod_1.z.string(), zod_1.z.any()), // fallback for unknown platforms
|
|
275
|
+
]).optional(),
|
|
276
|
+
processing_errors: zod_1.z.string().optional(),
|
|
277
|
+
});
|
|
278
|
+
// Add validation refinement for platform-specific fields
|
|
279
|
+
exports.zodDataSchemaWithValidation = exports.zodDataSchema.refine((data) => {
|
|
280
|
+
// Ensure data_id exists for most platforms except websites
|
|
281
|
+
if (data.platform && data.platform !== 'website' && !data.data_id) {
|
|
282
|
+
return false;
|
|
283
|
+
}
|
|
284
|
+
return true;
|
|
285
|
+
}, {
|
|
286
|
+
message: "data_id is required for non-website platforms",
|
|
287
|
+
path: ["data_id"],
|
|
148
288
|
});
|
|
149
289
|
// Helper function to get allowed entities for a platform
|
|
150
290
|
const getAllowedEntitiesForPlatform = (platform) => {
|
package/index.ts
CHANGED
|
@@ -11,44 +11,146 @@ import {
|
|
|
11
11
|
|
|
12
12
|
export const MongoDataSchema = new Schema(
|
|
13
13
|
{
|
|
14
|
-
|
|
14
|
+
// Timestamps
|
|
15
|
+
timestamp: { type: Number }, // INDEX - ingestion timestamp
|
|
16
|
+
data_timestamp: { type: Number }, // original post timestamp
|
|
17
|
+
created_at: { type: Number, default: Date.now, required: true },
|
|
18
|
+
updated_at: { type: Number, default: Date.now, required: true },
|
|
19
|
+
|
|
20
|
+
// Platform & Source Info (denormalized for query performance)
|
|
15
21
|
platform: {
|
|
16
22
|
type: String,
|
|
17
23
|
enum: platformsList,
|
|
18
|
-
},
|
|
19
|
-
source_region_id: { type: String }, // e.g., 'hebron'
|
|
20
|
-
source_region_title: { type: String }, // e.g., 'hebron'
|
|
21
|
-
source_dominant_geos: { type: [String], default: [] }, // e.g., ['hebron', 'west bank']
|
|
24
|
+
},
|
|
22
25
|
source_id: { type: Schema.Types.Mixed, required: true }, // INDEX - reference to Source _id
|
|
23
|
-
source_title: { type: String },
|
|
24
|
-
source_url: { type: String },
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
26
|
+
source_title: { type: String },
|
|
27
|
+
source_url: { type: String },
|
|
28
|
+
source_region_id: { type: String }, // INDEX
|
|
29
|
+
source_region_title: { type: String },
|
|
30
|
+
source_group_id: { type: String }, // INDEX
|
|
31
|
+
source_dominant_geos: { type: [String], default: [] },
|
|
32
|
+
|
|
33
|
+
// Data/Post Identifiers
|
|
34
|
+
data_id: { type: Schema.Types.Mixed }, // INDEX - platform-specific post ID
|
|
35
|
+
data_url: { type: String }, // direct link to the post
|
|
36
|
+
data_original_type: {
|
|
37
|
+
type: String,
|
|
38
|
+
enum: [
|
|
39
|
+
'post',
|
|
40
|
+
'comment',
|
|
41
|
+
'reply',
|
|
42
|
+
'video',
|
|
43
|
+
'image',
|
|
44
|
+
'photo',
|
|
45
|
+
'story',
|
|
46
|
+
'reel',
|
|
47
|
+
'article',
|
|
48
|
+
'link',
|
|
49
|
+
'document',
|
|
50
|
+
],
|
|
51
|
+
},
|
|
52
|
+
|
|
53
|
+
// Content
|
|
54
|
+
data_text: { type: String },
|
|
55
|
+
data_language: { type: String }, // ISO 639-1 code (e.g., 'en', 'ar')
|
|
56
|
+
data_geo: { type: [String], default: [] }, // INDEX - extracted locations
|
|
57
|
+
|
|
58
|
+
// Analysis (populated by processors)
|
|
59
|
+
data_sentiment: {
|
|
60
|
+
type: String,
|
|
61
|
+
enum: ['positive', 'negative', 'neutral', 'mixed'],
|
|
62
|
+
},
|
|
63
|
+
data_topics: { type: [String], default: [] }, // extracted topics/themes
|
|
64
|
+
data_keywords: { type: [String], default: [] }, // extracted keywords
|
|
65
|
+
|
|
66
|
+
// Media attachments
|
|
39
67
|
media: {
|
|
40
68
|
type: [
|
|
41
69
|
{
|
|
42
|
-
type: {
|
|
70
|
+
type: {
|
|
71
|
+
type: String,
|
|
72
|
+
enum: [
|
|
73
|
+
'image',
|
|
74
|
+
'video',
|
|
75
|
+
'audio',
|
|
76
|
+
'link',
|
|
77
|
+
'document',
|
|
78
|
+
'gif',
|
|
79
|
+
'sticker',
|
|
80
|
+
],
|
|
81
|
+
},
|
|
43
82
|
url: { type: String },
|
|
44
83
|
caption: { type: String },
|
|
84
|
+
thumbnail_url: { type: String },
|
|
85
|
+
width: { type: Number },
|
|
86
|
+
height: { type: Number },
|
|
87
|
+
duration: { type: Number }, // for video/audio in seconds
|
|
88
|
+
size: { type: Number }, // file size in bytes
|
|
89
|
+
mime_type: { type: String },
|
|
45
90
|
},
|
|
46
91
|
],
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
92
|
+
default: [],
|
|
93
|
+
},
|
|
94
|
+
|
|
95
|
+
// Author information (legacy fields for backward compatibility)
|
|
96
|
+
author: { type: String }, // legacy - display name
|
|
97
|
+
author_username: { type: String }, // legacy
|
|
98
|
+
author_id: { type: String }, // legacy
|
|
99
|
+
// NEW - structured author info
|
|
100
|
+
author_info: {
|
|
101
|
+
type: {
|
|
102
|
+
id: { type: String },
|
|
103
|
+
username: { type: String },
|
|
104
|
+
display_name: { type: String },
|
|
105
|
+
avatar_url: { type: String },
|
|
106
|
+
is_verified: { type: Boolean },
|
|
107
|
+
follower_count: { type: Number },
|
|
108
|
+
},
|
|
109
|
+
},
|
|
110
|
+
|
|
111
|
+
// Reply/Thread information
|
|
112
|
+
is_reply: { type: Boolean, default: false },
|
|
113
|
+
reply_to_message_id: { type: Schema.Types.Mixed },
|
|
114
|
+
reply_to_author_id: { type: String },
|
|
115
|
+
replies: { type: Schema.Types.Mixed }, // legacy field - keep for backward compatibility
|
|
116
|
+
// NEW - structured replies info
|
|
117
|
+
replies_info: {
|
|
118
|
+
type: {
|
|
119
|
+
count: { type: Number, default: 0 },
|
|
120
|
+
recent_repliers: { type: [String] },
|
|
121
|
+
has_thread: { type: Boolean },
|
|
122
|
+
thread_id: { type: String },
|
|
123
|
+
},
|
|
124
|
+
},
|
|
125
|
+
|
|
126
|
+
// Engagement metrics (platform-specific)
|
|
127
|
+
engagement: {
|
|
128
|
+
type: {
|
|
129
|
+
views: { type: Number },
|
|
130
|
+
likes: { type: Number },
|
|
131
|
+
shares: { type: Number },
|
|
132
|
+
comments: { type: Number },
|
|
133
|
+
reactions: { type: Map, of: Number }, // e.g., { "like": 10, "love": 5 }
|
|
134
|
+
},
|
|
135
|
+
},
|
|
136
|
+
|
|
137
|
+
// Content flags
|
|
138
|
+
is_edited: { type: Boolean },
|
|
139
|
+
edit_date: { type: Number },
|
|
140
|
+
is_pinned: { type: Boolean },
|
|
141
|
+
is_deleted: { type: Boolean },
|
|
142
|
+
is_forwarded: { type: Boolean },
|
|
143
|
+
forwarded_from_id: { type: String },
|
|
144
|
+
forwarded_date: { type: Number },
|
|
145
|
+
|
|
146
|
+
// Hashtags and mentions (extracted for easier querying)
|
|
147
|
+
hashtags: { type: [String], default: [] },
|
|
148
|
+
mentions: { type: [String], default: [] },
|
|
149
|
+
|
|
150
|
+
// Platform-specific metadata (flexible)
|
|
151
|
+
metadata: { type: Object },
|
|
152
|
+
|
|
153
|
+
processing_errors: { type: String },
|
|
52
154
|
},
|
|
53
155
|
{
|
|
54
156
|
versionKey: false,
|