@web-auto/webauto 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. package/apps/desktop-console/default-settings.json +2 -2
  2. package/apps/desktop-console/dist/main/index.mjs +915 -85
  3. package/apps/desktop-console/dist/main/preload.mjs +7 -0
  4. package/apps/desktop-console/dist/renderer/index.html +622 -50
  5. package/apps/desktop-console/dist/renderer/index.js +2415 -470
  6. package/apps/desktop-console/dist/renderer/run.mts +6 -5
  7. package/apps/desktop-console/entry/ui-cli.mjs +672 -0
  8. package/apps/desktop-console/entry/ui-console.mjs +416 -29
  9. package/apps/webauto/entry/account.mjs +89 -53
  10. package/apps/webauto/entry/browser-status.mjs +7 -10
  11. package/apps/webauto/entry/lib/account-detect.mjs +254 -28
  12. package/apps/webauto/entry/lib/account-store.mjs +219 -30
  13. package/apps/webauto/entry/lib/bus-publish.mjs +63 -0
  14. package/apps/webauto/entry/lib/camo-cli.mjs +93 -0
  15. package/apps/webauto/entry/lib/profilepool.mjs +14 -5
  16. package/apps/webauto/entry/lib/quota-status.mjs +23 -0
  17. package/apps/webauto/entry/lib/schedule-store.mjs +1068 -0
  18. package/apps/webauto/entry/profilepool.mjs +106 -17
  19. package/apps/webauto/entry/schedule.mjs +612 -0
  20. package/apps/webauto/entry/weibo-unified.mjs +134 -0
  21. package/apps/webauto/entry/xhs-install.mjs +236 -29
  22. package/apps/webauto/entry/xhs-status.mjs +5 -2
  23. package/apps/webauto/entry/xhs-unified.mjs +631 -98
  24. package/apps/webauto/resources/container-library/weibo/weibo_detail_page/comment_item/container.json +40 -0
  25. package/apps/webauto/resources/container-library/weibo/weibo_detail_page/reply_expand_button/container.json +38 -0
  26. package/apps/webauto/resources/container-library/weibo/weibo_detail_page/reply_list/container.json +37 -0
  27. package/apps/webauto/resources/container-library/weibo/weibo_search_page/container.json +8 -3
  28. package/apps/webauto/resources/container-library/weibo/weibo_search_page/login_anchor/container.json +30 -0
  29. package/apps/webauto/resources/container-library/weibo/weibo_search_page/search_bar/container.json +47 -0
  30. package/apps/webauto/resources/container-library/weibo/weibo_search_page/search_button/container.json +39 -0
  31. package/bin/camoufox-cli.mjs +61 -0
  32. package/bin/webauto.mjs +301 -54
  33. package/dist/modules/camo-backend/src/index.js +49 -1
  34. package/dist/modules/camo-backend/src/internal/BrowserSession.js +572 -3
  35. package/dist/modules/camo-backend/src/internal/SessionManager.js +13 -1
  36. package/dist/modules/camo-backend/src/internal/storage-paths.js +6 -0
  37. package/dist/modules/collection-manager/bloom-filter.js +91 -0
  38. package/dist/modules/collection-manager/date-utils.js +275 -0
  39. package/dist/modules/collection-manager/index.js +258 -0
  40. package/dist/modules/collection-manager/storage.js +195 -0
  41. package/dist/modules/collection-manager/types.js +47 -0
  42. package/dist/modules/logging/src/index.js +1 -1
  43. package/dist/modules/process-registry/index.js +230 -0
  44. package/dist/modules/rate-limiter/index.js +242 -0
  45. package/dist/modules/workflow/blocks/ExecuteWeiboSearchBlock.js +128 -0
  46. package/dist/modules/workflow/blocks/PersistXhsNoteBlock.js +7 -3
  47. package/dist/modules/workflow/blocks/RenderMarkdown.js +4 -1
  48. package/dist/modules/workflow/blocks/WeiboCollectCommentsBlock.js +282 -0
  49. package/dist/modules/workflow/blocks/WeiboCollectFromLinksBlock.js +283 -0
  50. package/dist/modules/workflow/blocks/WeiboCollectSearchLinksBlock.js +208 -0
  51. package/dist/modules/workflow/blocks/WeiboCollectTimelineListBlock.js +128 -0
  52. package/dist/modules/workflow/blocks/WeiboCollectUserPostsListBlock.js +127 -0
  53. package/dist/modules/workflow/blocks/helpers/downloadPaths.js +21 -0
  54. package/dist/modules/workflow/config/workflowRegistry.js +2 -0
  55. package/dist/modules/workflow/definitions/weibo-search-workflow-v1.js +47 -0
  56. package/dist/modules/workflow/src/runner.js +6 -0
  57. package/dist/modules/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.js +4 -0
  58. package/dist/modules/xiaohongshu/app/src/blocks/Phase3InteractBlock.js +2 -2
  59. package/dist/modules/xiaohongshu/app/src/blocks/helpers/sharding.js +123 -0
  60. package/dist/modules/xiaohongshu/app/src/container-registry/src/index.d.ts +37 -0
  61. package/dist/modules/xiaohongshu/app/src/container-registry/src/index.js +184 -0
  62. package/dist/modules/xiaohongshu/app/src/workflow/blocks/AnchorVerificationBlock.d.ts +31 -0
  63. package/dist/modules/xiaohongshu/app/src/workflow/blocks/AnchorVerificationBlock.js +71 -0
  64. package/dist/modules/xiaohongshu/app/src/workflow/blocks/DetectPageStateBlock.d.ts +48 -0
  65. package/dist/modules/xiaohongshu/app/src/workflow/blocks/DetectPageStateBlock.js +259 -0
  66. package/dist/modules/xiaohongshu/app/src/workflow/blocks/ErrorRecoveryBlock.d.ts +28 -0
  67. package/dist/modules/xiaohongshu/app/src/workflow/blocks/ErrorRecoveryBlock.js +319 -0
  68. package/dist/modules/xiaohongshu/app/src/workflow/blocks/WaitSearchPermitBlock.d.ts +36 -0
  69. package/dist/modules/xiaohongshu/app/src/workflow/blocks/WaitSearchPermitBlock.js +162 -0
  70. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/containerAnchors.d.ts +36 -0
  71. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/containerAnchors.js +301 -0
  72. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/operationLogger.d.ts +29 -0
  73. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/operationLogger.js +195 -0
  74. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/searchPageState.d.ts +25 -0
  75. package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/searchPageState.js +164 -0
  76. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/MatchCommentsBlock.d.ts +66 -0
  77. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/MatchCommentsBlock.js +139 -0
  78. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1EnsureServicesBlock.d.ts +16 -0
  79. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1EnsureServicesBlock.js +36 -0
  80. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1MonitorCookieBlock.d.ts +27 -0
  81. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1MonitorCookieBlock.js +213 -0
  82. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.d.ts +18 -0
  83. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.js +121 -0
  84. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2CollectLinksBlock.d.ts +34 -0
  85. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2CollectLinksBlock.js +1249 -0
  86. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2SearchBlock.d.ts +17 -0
  87. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2SearchBlock.js +703 -0
  88. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseDetailBlock.d.ts +15 -0
  89. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseDetailBlock.js +41 -0
  90. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseTabsBlock.d.ts +26 -0
  91. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseTabsBlock.js +44 -0
  92. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CollectCommentsBlock.d.ts +29 -0
  93. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CollectCommentsBlock.js +150 -0
  94. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ExtractDetailBlock.d.ts +38 -0
  95. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ExtractDetailBlock.js +117 -0
  96. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenDetailBlock.d.ts +30 -0
  97. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenDetailBlock.js +102 -0
  98. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenTabsBlock.d.ts +23 -0
  99. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenTabsBlock.js +109 -0
  100. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.d.ts +32 -0
  101. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.js +117 -0
  102. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ProcessSingleNoteBlock.d.ts +35 -0
  103. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ProcessSingleNoteBlock.js +114 -0
  104. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ValidateLinksBlock.d.ts +34 -0
  105. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ValidateLinksBlock.js +90 -0
  106. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase3InteractBlock.d.ts +111 -0
  107. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase3InteractBlock.js +1009 -0
  108. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase4MultiTabHarvestBlock.d.ts +20 -0
  109. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase4MultiTabHarvestBlock.js +233 -0
  110. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/ReplyInteractBlock.d.ts +48 -0
  111. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/ReplyInteractBlock.js +291 -0
  112. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/XhsDiscoverFallbackBlock.d.ts +23 -0
  113. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/XhsDiscoverFallbackBlock.js +240 -0
  114. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatchDsl.d.ts +55 -0
  115. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatchDsl.js +126 -0
  116. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatcher.d.ts +21 -0
  117. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatcher.js +99 -0
  118. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/evidence.d.ts +5 -0
  119. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/evidence.js +27 -0
  120. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/sharding.d.ts +37 -0
  121. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/sharding.js +165 -0
  122. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/xhsComments.d.ts +33 -0
  123. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/xhsComments.js +270 -0
  124. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/index.d.ts +9 -0
  125. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/index.js +9 -0
  126. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/checkpoints.d.ts +50 -0
  127. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/checkpoints.js +222 -0
  128. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/controllerAction.d.ts +10 -0
  129. package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/controllerAction.js +43 -0
  130. package/dist/services/shared/serviceProcessLogger.js +1 -1
  131. package/dist/services/unified-api/server.js +105 -11
  132. package/modules/camo-backend/src/index.ts +46 -1
  133. package/modules/camo-backend/src/internal/BrowserSession.ts +619 -3
  134. package/modules/camo-backend/src/internal/SessionManager.ts +12 -1
  135. package/modules/camo-backend/src/internal/storage-paths.ts +5 -0
  136. package/modules/camo-runtime/src/autoscript/action-providers/xhs/comments.mjs +38 -2
  137. package/modules/camo-runtime/src/autoscript/action-providers/xhs/interaction.mjs +47 -2
  138. package/modules/camo-runtime/src/autoscript/action-providers/xhs/search.mjs +94 -11
  139. package/modules/camo-runtime/src/autoscript/action-providers/xhs.mjs +208 -2
  140. package/modules/camo-runtime/src/autoscript/runtime.mjs +7 -1
  141. package/modules/camo-runtime/src/autoscript/xhs-unified-template.mjs +76 -43
  142. package/modules/camo-runtime/src/container/runtime-core/operations/index.mjs +75 -1
  143. package/modules/camo-runtime/src/container/runtime-core/operations/selector-scripts.mjs +71 -4
  144. package/modules/camo-runtime/src/container/runtime-core/operations/tab-pool.mjs +183 -27
  145. package/modules/collection-manager/bloom-filter.ts +112 -0
  146. package/modules/collection-manager/date-utils.ts +316 -0
  147. package/modules/collection-manager/index.ts +309 -0
  148. package/modules/collection-manager/package.json +10 -0
  149. package/modules/collection-manager/storage.ts +174 -0
  150. package/modules/collection-manager/types.ts +156 -0
  151. package/modules/logging/src/index.ts +1 -1
  152. package/modules/process-registry/index.ts +284 -0
  153. package/modules/rate-limiter/index.ts +322 -0
  154. package/modules/state/src/paths.ts +9 -1
  155. package/modules/task-scheduler/index.ts +293 -0
  156. package/modules/workflow/blocks/ExecuteWeiboSearchBlock.ts +167 -0
  157. package/modules/workflow/blocks/PersistXhsNoteBlock.ts +7 -3
  158. package/modules/workflow/blocks/RenderMarkdown.ts +4 -1
  159. package/modules/workflow/blocks/WeiboCollectCommentsBlock.ts +339 -0
  160. package/modules/workflow/blocks/WeiboCollectFromLinksBlock.ts +338 -0
  161. package/modules/workflow/blocks/helpers/downloadPaths.ts +16 -0
  162. package/modules/workflow/config/workflowRegistry.ts +2 -0
  163. package/modules/workflow/definitions/weibo-search-workflow-v1.ts +47 -0
  164. package/modules/workflow/src/runner.ts +6 -0
  165. package/modules/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.ts +1 -1
  166. package/modules/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.ts +4 -0
  167. package/modules/xiaohongshu/app/src/blocks/Phase3InteractBlock.ts +2 -3
  168. package/modules/xiaohongshu/app/src/blocks/helpers/sharding.ts +152 -0
  169. package/package.json +13 -4
  170. package/scripts/postinstall-resources.mjs +62 -0
  171. package/scripts/test/run-coverage.mjs +76 -0
  172. package/scripts/weibo/search.ts +49 -0
  173. package/services/shared/serviceProcessLogger.ts +1 -1
  174. package/services/unified-api/server.ts +98 -12
@@ -0,0 +1,309 @@
1
+ /**
2
+ * Collection Data Manager
3
+ *
4
+ * Unified data management for all platforms and collection types.
5
+ * Handles:
6
+ * - Deduplication via Bloom Filter
7
+ * - Fresh/Incremental modes
8
+ * - File storage with human-readable paths
9
+ * - Stats and persistence
10
+ */
11
+
12
+ import * as os from 'os';
13
+ import * as path from 'path';
14
+ import { BloomFilter } from './bloom-filter';
15
+ import { CollectionStorage } from './storage';
16
+ import { getCurrentTimestamp, parsePlatformDate } from './date-utils';
17
+ import type {
18
+ Platform,
19
+ CollectionSource,
20
+ CollectionMode,
21
+ CollectionMeta,
22
+ CollectionIdSpec,
23
+ PostRecord,
24
+ CommentRecord
25
+ } from './types';
26
+ import { buildCollectionId, parseCollectionId } from './types';
27
+
28
+ export interface CollectionManagerOptions { // Constructor options for CollectionDataManager
29
+ platform: Platform; // Platform name; also the first directory level under baseDir
30
+ env: string; // Environment label; second directory level under baseDir
31
+ spec: CollectionIdSpec; // Describes what is collected; turned into the collection ID via buildCollectionId
32
+ mode?: CollectionMode; // Falls back to 'incremental' when omitted
33
+ baseDir?: string; // Storage root; falls back to <home>/.webauto/download when omitted
34
+ }
35
+
36
+ export interface CollectionStats { // Counters kept by CollectionDataManager; getStats() returns a shallow copy
37
+ totalPosts: number; // Set from the stored post line count in init(), then incremented per added post
38
+ totalComments: number; // Set from the stored comment line count in init(), then incremented per added comment
39
+ newPosts: number; // Posts appended by this manager instance
40
+ newComments: number; // Comments appended by this manager instance
41
+ duplicatesSkipped: number; // Incremented only when addPost() rejects a post; rejected comments are not counted
42
+ }
43
+
44
/**
 * Coordinates one collection: deduplicates records through a bloom filter,
 * appends them to JSONL files via CollectionStorage, and keeps the metadata
 * file and run counters up to date.
 *
 * Posts and comments share a single bloom filter. Posts are keyed by `id`,
 * comments by `<postId>:<id>`. The filter is configured for a 0.1%
 * false-positive rate, so a small fraction of genuinely new records may be
 * reported as duplicates.
 */
export class CollectionDataManager {
  /** Number of keys the bloom filter is sized for. */
  private static readonly BLOOM_CAPACITY = 500000;
  /** Target false-positive rate of the bloom filter (0.1%). */
  private static readonly BLOOM_FALSE_POSITIVE_RATE = 0.001;

  private platform: Platform;
  private env: string;
  private spec: CollectionIdSpec;
  private mode: CollectionMode;
  private collectionId: string;
  private storage: CollectionStorage;
  private bloomFilter: BloomFilter;
  private meta: CollectionMeta | null = null;
  private stats: CollectionStats = {
    totalPosts: 0,
    totalComments: 0,
    newPosts: 0,
    newComments: 0,
    duplicatesSkipped: 0
  };

  /**
   * @param options Platform, environment and collection spec. `mode` falls
   *   back to 'incremental' and `baseDir` to `<home>/.webauto/download`.
   */
  constructor(options: CollectionManagerOptions) {
    this.platform = options.platform;
    this.env = options.env;
    this.spec = options.spec;
    this.mode = options.mode || 'incremental';

    // Human-readable collection ID; it doubles as the collection directory name.
    this.collectionId = buildCollectionId(options.spec);

    const baseDir = CollectionDataManager.resolveBaseDir(options.baseDir);
    this.storage = new CollectionStorage(baseDir, this.platform, this.env, this.collectionId);
    this.bloomFilter = CollectionDataManager.createBloomFilter();
  }

  /** Returns the given base directory, or the default download directory when it is empty. */
  private static resolveBaseDir(baseDir?: string): string {
    return baseDir || path.join(os.homedir(), '.webauto', 'download');
  }

  /** Creates an empty bloom filter with the shared capacity and error rate. */
  private static createBloomFilter(): BloomFilter {
    return new BloomFilter(
      CollectionDataManager.BLOOM_CAPACITY,
      CollectionDataManager.BLOOM_FALSE_POSITIVE_RATE
    );
  }

  /** Dedupe key of a comment inside the shared bloom filter. */
  private static commentKey(comment: { postId: string; id: string }): string {
    return `${comment.postId}:${comment.id}`;
  }

  /**
   * Prepare the collection for use.
   *
   * - 'fresh' mode: stored data files are cleared and new metadata is written.
   * - 'incremental' mode: existing metadata is reused when present, and the
   *   bloom filter is restored from it. If no usable filter state exists
   *   (no metadata, no saved state, or the import throws), the filter is
   *   rebuilt from the stored posts AND comments so previously collected
   *   records are still recognised as duplicates.
   *
   * Finally the total counters are initialised from the stored line counts.
   */
  async init(): Promise<void> {
    await this.storage.init();

    const existingMeta = await this.storage.readMeta();

    if (this.mode === 'fresh') {
      await this.storage.clear();
      this.bloomFilter = CollectionDataManager.createBloomFilter();
      this.meta = this.createMeta();
    } else {
      if (existingMeta) {
        this.meta = existingMeta;
        this.meta.updatedAt = new Date().toISOString();
      } else {
        this.meta = this.createMeta();
      }

      if (!this.restoreBloomFilter(existingMeta?.bloomFilter)) {
        // Also covers data files that outlived a missing or unreadable meta file.
        await this.rebuildBloomFilterFromStorage();
      }
    }

    await this.storage.writeMeta(this.meta);

    const storageStats = await this.storage.getStats();
    this.stats.totalPosts = storageStats.postCount;
    this.stats.totalComments = storageStats.commentCount;
  }

  /**
   * Try to restore the bloom filter from its serialized form.
   * @returns true when the filter was restored, false when there was no
   *   state or the import threw.
   */
  private restoreBloomFilter(serialized: string | undefined): boolean {
    if (!serialized) return false;
    try {
      this.bloomFilter = BloomFilter.import(serialized, CollectionDataManager.BLOOM_CAPACITY);
      return true;
    } catch {
      return false;
    }
  }

  /** Rebuild the bloom filter from every stored post and comment. */
  private async rebuildBloomFilterFromStorage(): Promise<void> {
    this.bloomFilter = CollectionDataManager.createBloomFilter();

    const posts = await this.storage.readPosts();
    for (const post of posts) {
      this.bloomFilter.add(post.id);
    }

    const comments = await this.storage.readComments();
    for (const comment of comments) {
      this.bloomFilter.add(CollectionDataManager.commentKey(comment));
    }
  }

  /** Build a new metadata record for this collection with zeroed totals. */
  private createMeta(): CollectionMeta {
    const now = new Date().toISOString();
    const spec = this.spec;

    const meta: CollectionMeta = {
      platform: this.platform,
      env: this.env,
      collectionId: this.collectionId,
      source: spec.source,
      createdAt: now,
      updatedAt: now,
      totalPosts: 0,
      totalComments: 0,
      mode: this.mode
    };

    // Only search and user collections carry extra identifying fields.
    if (spec.source === 'search') {
      meta.keyword = spec.keyword;
    } else if (spec.source === 'user') {
      meta.userId = spec.userId;
      meta.userName = spec.userName;
    }

    return meta;
  }

  /**
   * Check whether a post ID is already known to the bloom filter.
   * A `true` result can be a false positive; `false` is definitive.
   */
  hasPost(postId: string): boolean {
    return this.bloomFilter.mightContain(postId);
  }

  /**
   * Store a post unless the bloom filter already reports its ID.
   *
   * When `post.collectedAt` is falsy, the three collected-at fields are
   * filled in on the passed object itself from getCurrentTimestamp().
   *
   * @returns true if the post was appended, false if it was skipped as a
   *   duplicate (which also increments `duplicatesSkipped`).
   */
  async addPost(post: PostRecord): Promise<boolean> {
    if (this.bloomFilter.mightContain(post.id)) {
      this.stats.duplicatesSkipped++;
      return false;
    }

    if (!post.collectedAt) {
      const ts = getCurrentTimestamp();
      post.collectedAt = ts.collectedAt;
      post.collectedAtLocal = ts.collectedAtLocal;
      post.collectedDate = ts.collectedDate;
    }

    // The key is registered before the first await so that concurrent calls
    // with the same ID cannot both pass the duplicate check.
    this.bloomFilter.add(post.id);
    await this.storage.appendPost(post);

    this.stats.totalPosts++;
    this.stats.newPosts++;

    return true;
  }

  /**
   * Store a comment unless its `<postId>:<id>` key is already in the bloom
   * filter. Skipped comments are not counted in `duplicatesSkipped`.
   *
   * @returns true if the comment was appended, false if it was skipped.
   */
  async addComment(comment: CommentRecord): Promise<boolean> {
    const commentKey = CollectionDataManager.commentKey(comment);

    if (this.bloomFilter.mightContain(commentKey)) {
      return false;
    }

    this.bloomFilter.add(commentKey);
    await this.storage.appendComment(comment);

    this.stats.totalComments++;
    this.stats.newComments++;

    return true;
  }

  /**
   * Write the current totals, timestamp and exported bloom filter state to
   * the metadata file. Does nothing before init() has set the metadata.
   */
  async persist(): Promise<void> {
    if (!this.meta) return;

    this.meta.updatedAt = new Date().toISOString();
    this.meta.totalPosts = this.stats.totalPosts;
    this.meta.totalComments = this.stats.totalComments;
    this.meta.bloomFilter = this.bloomFilter.export();

    await this.storage.writeMeta(this.meta);
  }

  /** Get a shallow copy of the current counters. */
  getStats(): CollectionStats {
    return { ...this.stats };
  }

  /** Get the live metadata object, or null before init(). */
  getMeta(): CollectionMeta | null {
    return this.meta;
  }

  /** Get the collection ID built from the spec. */
  getCollectionId(): string {
    return this.collectionId;
  }

  /**
   * Get storage paths for external use (e.g., logging).
   */
  getPaths(): {
    collectionDir: string;
    postsPath: string;
    commentsPath: string;
    linksPath: string;
    runLogPath: string;
  } {
    const postsPath = this.storage.getPostsPath();
    return {
      // posts.jsonl sits directly inside the collection directory, so its
      // parent is that directory; this avoids reaching into private state.
      collectionDir: path.dirname(postsPath),
      postsPath,
      commentsPath: this.storage.getCommentsPath(),
      linksPath: this.storage.getLinksPath(),
      runLogPath: this.storage.getRunLogPath()
    };
  }

  /**
   * List the collections stored for a platform/env pair.
   * `spec` is null for directory names that parseCollectionId cannot parse.
   */
  static async listCollections(
    platform: Platform,
    env: string,
    baseDir?: string
  ): Promise<Array<{ collectionId: string; spec: CollectionIdSpec | null }>> {
    const dir = CollectionDataManager.resolveBaseDir(baseDir);
    const ids = await CollectionStorage.listCollections(dir, platform, env);

    return ids.map(id => ({
      collectionId: id,
      spec: parseCollectionId(id)
    }));
  }

  /**
   * Merge the posts of several collections into a target collection.
   *
   * Posts are deduplicated by `id` across all sources and against posts the
   * target already holds, so repeating a merge (or listing the target among
   * the sources) does not duplicate data. Only posts are merged; comments
   * and metadata are not touched.
   */
  static async mergeCollections(
    sourceIds: string[],
    targetId: string,
    platform: Platform,
    env: string,
    baseDir?: string
  ): Promise<void> {
    const dir = CollectionDataManager.resolveBaseDir(baseDir);
    const targetStorage = new CollectionStorage(dir, platform, env, targetId);
    await targetStorage.init();

    const seenPostIds = new Set<string>();
    for (const existing of await targetStorage.readPosts()) {
      seenPostIds.add(existing.id);
    }

    for (const sourceId of sourceIds) {
      const sourceStorage = new CollectionStorage(dir, platform, env, sourceId);
      const posts = await sourceStorage.readPosts();

      for (const post of posts) {
        if (seenPostIds.has(post.id)) continue;
        seenPostIds.add(post.id);
        await targetStorage.appendPost(post);
      }
    }
  }
}
305
+
306
+ export { buildCollectionId, parseCollectionId };
307
+ export type { CollectionIdSpec, CollectionMeta, PostRecord, CommentRecord };
308
+
309
+ export { getCurrentTimestamp, parsePlatformDate, extractWeiboPostDate } from "./date-utils.js";
@@ -0,0 +1,10 @@
1
+ {
2
+ "name": "@webauto/collection-manager",
3
+ "version": "1.0.0",
4
+ "main": "index.ts",
5
+ "types": "index.ts",
6
+ "dependencies": {},
7
+ "devDependencies": {
8
+ "@types/node": "^20.0.0"
9
+ }
10
+ }
@@ -0,0 +1,174 @@
1
+ /**
2
+ * File storage utilities for collection data
3
+ */
4
+
5
+ import * as fs from 'fs';
6
+ import * as path from 'path';
7
+ import type { CollectionMeta, PostRecord, CommentRecord } from './types';
8
+
9
/**
 * File-backed storage for one collection.
 *
 * Directory layout: `<baseDir>/<platform>/<env>/<collectionId>/`, holding
 * `collection-meta.json`, `posts.jsonl`, `comments.jsonl`, `links.jsonl`,
 * `run.log` and `run-events.jsonl`.
 */
export class CollectionStorage {
  private baseDir: string;
  private collectionDir: string;
  private platform: string;
  private env: string;
  private collectionId: string;

  /**
   * @param baseDir Root directory of all collections.
   * @param platform First directory level under `baseDir`.
   * @param env Second directory level.
   * @param collectionId Name of the collection directory.
   */
  constructor(baseDir: string, platform: string, env: string, collectionId: string) {
    this.baseDir = baseDir;
    this.platform = platform;
    this.env = env;
    this.collectionId = collectionId;

    // Directory structure: baseDir/platform/env/collectionId/
    this.collectionDir = path.join(baseDir, platform, env, collectionId);
  }

  /** Create the collection directory (and any missing parents). */
  async init(): Promise<void> {
    await fs.promises.mkdir(this.collectionDir, { recursive: true });
  }

  /** Path of the metadata JSON file. */
  getMetaPath(): string {
    return path.join(this.collectionDir, 'collection-meta.json');
  }

  /** Path of the posts JSONL file. */
  getPostsPath(): string {
    return path.join(this.collectionDir, 'posts.jsonl');
  }

  /** Path of the comments JSONL file. */
  getCommentsPath(): string {
    return path.join(this.collectionDir, 'comments.jsonl');
  }

  /** Path of the links JSONL file. */
  getLinksPath(): string {
    return path.join(this.collectionDir, 'links.jsonl');
  }

  /** Path of the run log file. */
  getRunLogPath(): string {
    return path.join(this.collectionDir, 'run.log');
  }

  /** Path of the run events JSONL file. */
  getEventsPath(): string {
    return path.join(this.collectionDir, 'run-events.jsonl');
  }

  /**
   * Read the metadata file.
   * @returns The parsed metadata, or null when the file cannot be read, is
   *   not valid JSON, or does not contain a JSON object.
   */
  async readMeta(): Promise<CollectionMeta | null> {
    try {
      const data = await fs.promises.readFile(this.getMetaPath(), 'utf-8');
      const parsed: unknown = JSON.parse(data);
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        return null;
      }
      return parsed as CollectionMeta;
    } catch {
      return null;
    }
  }

  /** Write the metadata file as pretty-printed JSON, replacing any previous content. */
  async writeMeta(meta: CollectionMeta): Promise<void> {
    await fs.promises.writeFile(this.getMetaPath(), JSON.stringify(meta, null, 2), 'utf-8');
  }

  /** Append one post as a single JSON line to the posts file. */
  async appendPost(post: PostRecord): Promise<void> {
    await fs.promises.appendFile(this.getPostsPath(), JSON.stringify(post) + '\n', 'utf-8');
  }

  /** Append one comment as a single JSON line to the comments file. */
  async appendComment(comment: CommentRecord): Promise<void> {
    await fs.promises.appendFile(this.getCommentsPath(), JSON.stringify(comment) + '\n', 'utf-8');
  }

  /**
   * Read a JSONL file into an array of records.
   *
   * Blank lines and lines that are not a JSON object are skipped, so one
   * damaged line (for example a truncated final write) no longer discards
   * every other record in the file. An unreadable file yields [].
   */
  private async readJsonl<T>(filePath: string): Promise<T[]> {
    let data: string;
    try {
      data = await fs.promises.readFile(filePath, 'utf-8');
    } catch {
      return [];
    }

    const records: T[] = [];
    for (const rawLine of data.split('\n')) {
      const line = rawLine.trim();
      if (line.length === 0) continue;
      try {
        const parsed: unknown = JSON.parse(line);
        if (parsed !== null && typeof parsed === 'object') {
          records.push(parsed as T);
        }
      } catch {
        // Malformed line: skip it and keep the remaining records.
      }
    }
    return records;
  }

  /** Read all stored posts; returns [] when the file is missing. */
  async readPosts(): Promise<PostRecord[]> {
    return this.readJsonl<PostRecord>(this.getPostsPath());
  }

  /** Read all stored comments; returns [] when the file is missing. */
  async readComments(): Promise<CommentRecord[]> {
    return this.readJsonl<CommentRecord>(this.getCommentsPath());
  }

  /**
   * Count the non-empty lines of a file (after trimming the file as a whole).
   * @returns 0 when the file cannot be read.
   */
  async countLines(filePath: string): Promise<number> {
    try {
      const data = await fs.promises.readFile(filePath, 'utf-8');
      return data.trim().split('\n').filter(line => line.length > 0).length;
    } catch {
      return 0;
    }
  }

  /** Line counts of the posts and comments files. */
  async getStats(): Promise<{ postCount: number; commentCount: number }> {
    return {
      postCount: await this.countLines(this.getPostsPath()),
      commentCount: await this.countLines(this.getCommentsPath())
    };
  }

  /**
   * Delete the posts, comments, links and run-events files (fresh mode).
   * The metadata file and run log are left in place.
   *
   * A missing file is not an error. Any other failure is rethrown, because
   * continuing would silently mix old records into a "fresh" collection.
   */
  async clear(): Promise<void> {
    const files = [
      this.getPostsPath(),
      this.getCommentsPath(),
      this.getLinksPath(),
      this.getEventsPath()
    ];

    for (const file of files) {
      try {
        await fs.promises.unlink(file);
      } catch (err: unknown) {
        const code = (err as { code?: unknown } | null)?.code;
        if (code !== 'ENOENT') {
          throw err;
        }
      }
    }
  }

  /**
   * List the collection directory names under `<baseDir>/<platform>/<env>`.
   * @returns [] when the directory cannot be read.
   */
  static async listCollections(baseDir: string, platform: string, env: string): Promise<string[]> {
    const dir = path.join(baseDir, platform, env);
    try {
      const entries = await fs.promises.readdir(dir, { withFileTypes: true });
      return entries
        .filter(e => e.isDirectory())
        .map(e => e.name);
    } catch {
      return [];
    }
  }
}
@@ -0,0 +1,156 @@
1
+ /** Collection types for unified data management across platforms */
2
+
3
+ export type Platform = 'xiaohongshu' | 'weibo' | '1688'; // Supported platforms; the value is also used as a storage directory name
4
+
5
+ export type CollectionSource = 'search' | 'timeline' | 'user' | 'note' | 'product'; // Same literals as the `source` discriminant of CollectionIdSpec
6
+
7
+ export type CollectionMode = 'fresh' | 'incremental'; // 'fresh': CollectionDataManager.init() clears stored data files first; 'incremental': existing data is kept
8
+
9
+ export interface CollectionMeta { // Metadata of one collection; stored as JSON in collection-meta.json
10
+ platform: Platform;
11
+ env: string;
12
+ /** Human-readable identifier - varies by source type (see buildCollectionId) */
13
+ collectionId: string;
14
+ /** Source type */
15
+ source: CollectionSource;
16
+ /** Creation timestamp (string produced by Date.prototype.toISOString) */
17
+ createdAt: string;
18
+ /** Last update timestamp; refreshed by CollectionDataManager.init() for existing metadata and by persist() */
19
+ updatedAt: string;
20
+ /** Total posts collected; copied from the run counters by CollectionDataManager.persist() */
21
+ totalPosts: number;
22
+ /** Total comments collected; copied from the run counters by CollectionDataManager.persist() */
23
+ totalComments: number;
24
+ /** Collection mode in effect when this metadata record was created */
25
+ mode: CollectionMode;
26
+ /** Bloom filter state (base64); written by CollectionDataManager.persist() from BloomFilter.export() */
27
+ bloomFilter?: string;
28
+ /** Optional: search keyword; set only when source is 'search' */
29
+ keyword?: string;
30
+ /** Optional: user ID for user monitoring; set only when source is 'user' */
31
+ userId?: string;
32
+ /** Optional: user name for display; set only when source is 'user' */
33
+ userName?: string;
34
+ }
35
+
36
+ export interface PostRecord { // One collected post; stored as a single JSON line in posts.jsonl
37
+ /** Unique ID from platform; used as the dedupe key by CollectionDataManager.addPost */
38
+ id: string;
39
+ /** Post URL */
40
+ url: string;
41
+ /** Author ID */
42
+ authorId?: string;
43
+ /** Author name */
44
+ authorName?: string;
45
+ /** Post content */
46
+ content?: string;
47
+ /** Collected timestamp (ISO 8601 UTC); when falsy, CollectionDataManager.addPost fills it from getCurrentTimestamp() */
48
+ collectedAt: string;
49
+ /** Collected timestamp (local with timezone); filled together with collectedAt */
50
+ collectedAtLocal?: string;
51
+ /** Collected date (YYYY-MM-DD, local); filled together with collectedAt */
52
+ collectedDate?: string;
53
+ /** Published date extracted from content (YYYY-MM-DD) */
54
+ publishedDate?: string;
55
+ /** Published time extracted from content (HH:MM) */
56
+ publishedTime?: string;
57
+ /** Full published datetime text */
58
+ publishedAtText?: string;
59
+ /** Raw data from platform */
60
+ raw?: Record<string, unknown>;
61
+ /** Comments count */
62
+ commentsCount?: number;
63
+ /** Likes count */
64
+ likesCount?: number;
65
+ }
66
+
67
+ export interface CommentRecord { // One collected comment; stored as a single JSON line in comments.jsonl
68
+ /** Unique ID; combined with postId as "<postId>:<id>" for deduplication in CollectionDataManager.addComment */
69
+ id: string;
70
+ /** Parent post ID */
71
+ postId: string;
72
+ /** Author ID */
73
+ authorId?: string;
74
+ /** Author name */
75
+ authorName?: string;
76
+ /** Comment content */
77
+ content?: string;
78
+ /** Parent comment ID (for nested comments) */
79
+ parentId?: string;
80
+ /** Reply to user name */
81
+ replyToName?: string;
82
+ /** Collected timestamp; unlike posts, CollectionDataManager.addComment does not fill this in, so callers must set it */
83
+ collectedAt: string;
84
+ /** Raw data */
85
+ raw?: Record<string, unknown>;
86
+ }
87
+
88
+ /**
89
+ * Collection ID naming conventions:
90
+ *
91
+ * search: "search:<keyword>" -> search:春晚
92
+ * timeline: "timeline:<date>" -> timeline:2026-02-20
93
+ * user: "user:<userId>:<userName>" -> user:1234567890:张三
94
+ * note: "note:<noteId>" -> note:abc123
95
+ * product: "product:<keyword>" -> product:女装
+ *
+ * For "user", buildCollectionId writes the placeholder "unknown" when userName
+ * is missing or empty, and replaces the characters / \ : * ? " < > | in
+ * userName with "_".
+ * The resulting ID is used by CollectionStorage as the collection directory name.
96
+ */
97
+ export type CollectionIdSpec = {
98
+ source: 'search';
99
+ keyword: string;
100
+ } | {
101
+ source: 'timeline';
102
+ date: string; // YYYY-MM-DD
103
+ } | {
104
+ source: 'user';
105
+ userId: string;
106
+ userName?: string;
107
+ } | {
108
+ source: 'note';
109
+ noteId: string;
110
+ } | {
111
+ source: 'product';
112
+ keyword: string;
113
+ };
114
+
115
/**
 * Build the collection ID string for a spec.
 *
 * Formats: `search:<keyword>`, `timeline:<date>`, `user:<userId>:<userName>`,
 * `note:<noteId>`, `product:<keyword>`.
 *
 * The ID is used as a directory name, so `/` and `\` in any field are
 * replaced with `_`; otherwise a field such as `x/../..` could nest inside or
 * escape the collection root. For `user`, the name additionally has
 * `: * ? " < > |` replaced and falls back to `unknown` when missing or empty.
 * IDs for values without path separators are unchanged.
 *
 * @param spec Collection spec; `source` selects the format.
 * @returns The collection ID.
 * @throws Error when `spec.source` is not one of the known sources (only
 *   reachable from untyped callers).
 */
export function buildCollectionId(spec: CollectionIdSpec): string {
  const stripSeparators = (value: string): string => String(value).replace(/[\/\\]/g, '_');

  switch (spec.source) {
    case 'search':
      return `search:${stripSeparators(spec.keyword)}`;
    case 'timeline':
      return `timeline:${stripSeparators(spec.date)}`;
    case 'user': {
      // Braces keep the declaration scoped to this case clause.
      const safeName = spec.userName?.replace(/[\/\\:*?"<>|]/g, '_') || 'unknown';
      return `user:${stripSeparators(spec.userId)}:${safeName}`;
    }
    case 'note':
      return `note:${stripSeparators(spec.noteId)}`;
    case 'product':
      return `product:${stripSeparators(spec.keyword)}`;
    default:
      // Fail loudly instead of returning undefined from a function typed as string.
      throw new Error(
        `Unknown collection source: ${String((spec as { source?: unknown }).source)}`
      );
  }
}
133
+
134
/**
 * Parse a collection ID back into its spec.
 *
 * The text before the first `:` selects the source. For `search`,
 * `timeline`, `note` and `product` everything after that colon is the
 * value, so values containing `:` survive a round trip through
 * buildCollectionId. For `user` the remainder is split on `:` into
 * `userId` and `userName` (left undefined when absent).
 *
 * @param id Collection ID, e.g. a collection directory name.
 * @returns The parsed spec, or null when `id` has no `:` or an unknown source.
 */
export function parseCollectionId(id: string): CollectionIdSpec | null {
  const separatorIndex = id.indexOf(':');
  if (separatorIndex === -1) return null;

  const source = id.slice(0, separatorIndex);
  const payload = id.slice(separatorIndex + 1);

  switch (source) {
    case 'search':
      return { source: 'search', keyword: payload };
    case 'timeline':
      return { source: 'timeline', date: payload };
    case 'user': {
      // buildCollectionId removes ':' from the name, so the first two
      // segments are the user ID and the name.
      const segments = payload.split(':');
      return { source: 'user', userId: segments[0] ?? '', userName: segments[1] };
    }
    case 'note':
      return { source: 'note', noteId: payload };
    case 'product':
      return { source: 'product', keyword: payload };
    default:
      return null;
  }
}
@@ -53,7 +53,7 @@ export function logDebug(module: string, event: string, data: Record<string, any
53
53
  data,
54
54
  };
55
55
  try {
56
- fs.appendFileSync(DEBUG_LOG_FILE, `${JSON.stringify(entry)}\n`);
56
+ fs.appendFileSync(DEBUG_LOG_FILE, `${JSON.stringify(entry)}\n`, 'utf8');
57
57
  } catch {
58
58
  // ignore
59
59
  }