@web-auto/webauto 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/apps/desktop-console/default-settings.json +2 -2
- package/apps/desktop-console/dist/main/index.mjs +983 -128
- package/apps/desktop-console/dist/main/preload.mjs +7 -0
- package/apps/desktop-console/dist/renderer/index.html +622 -50
- package/apps/desktop-console/dist/renderer/index.js +2423 -469
- package/apps/desktop-console/dist/renderer/run.mts +6 -5
- package/apps/desktop-console/entry/ui-cli.mjs +672 -0
- package/apps/desktop-console/entry/ui-console.mjs +416 -29
- package/apps/webauto/entry/account.mjs +89 -53
- package/apps/webauto/entry/browser-status.mjs +7 -10
- package/apps/webauto/entry/lib/account-detect.mjs +254 -28
- package/apps/webauto/entry/lib/account-store.mjs +219 -30
- package/apps/webauto/entry/lib/bus-publish.mjs +63 -0
- package/apps/webauto/entry/lib/camo-cli.mjs +93 -0
- package/apps/webauto/entry/lib/profilepool.mjs +14 -5
- package/apps/webauto/entry/lib/quota-status.mjs +23 -0
- package/apps/webauto/entry/lib/schedule-store.mjs +1068 -0
- package/apps/webauto/entry/profilepool.mjs +106 -17
- package/apps/webauto/entry/schedule.mjs +612 -0
- package/apps/webauto/entry/weibo-unified.mjs +134 -0
- package/apps/webauto/entry/xhs-install.mjs +256 -31
- package/apps/webauto/entry/xhs-status.mjs +5 -2
- package/apps/webauto/entry/xhs-unified.mjs +631 -98
- package/apps/webauto/resources/container-library/weibo/weibo_detail_page/comment_item/container.json +40 -0
- package/apps/webauto/resources/container-library/weibo/weibo_detail_page/reply_expand_button/container.json +38 -0
- package/apps/webauto/resources/container-library/weibo/weibo_detail_page/reply_list/container.json +37 -0
- package/apps/webauto/resources/container-library/weibo/weibo_search_page/container.json +8 -3
- package/apps/webauto/resources/container-library/weibo/weibo_search_page/login_anchor/container.json +30 -0
- package/apps/webauto/resources/container-library/weibo/weibo_search_page/search_bar/container.json +47 -0
- package/apps/webauto/resources/container-library/weibo/weibo_search_page/search_button/container.json +39 -0
- package/bin/camoufox-cli.mjs +61 -0
- package/bin/webauto.mjs +301 -54
- package/dist/modules/camo-backend/src/index.js +49 -1
- package/dist/modules/camo-backend/src/internal/BrowserSession.js +572 -3
- package/dist/modules/camo-backend/src/internal/SessionManager.js +13 -1
- package/dist/modules/camo-backend/src/internal/storage-paths.js +6 -0
- package/dist/modules/collection-manager/bloom-filter.js +91 -0
- package/dist/modules/collection-manager/date-utils.js +275 -0
- package/dist/modules/collection-manager/index.js +258 -0
- package/dist/modules/collection-manager/storage.js +195 -0
- package/dist/modules/collection-manager/types.js +47 -0
- package/dist/modules/logging/src/index.js +1 -1
- package/dist/modules/process-registry/index.js +230 -0
- package/dist/modules/rate-limiter/index.js +242 -0
- package/dist/modules/workflow/blocks/ExecuteWeiboSearchBlock.js +128 -0
- package/dist/modules/workflow/blocks/PersistXhsNoteBlock.js +7 -3
- package/dist/modules/workflow/blocks/RenderMarkdown.js +4 -1
- package/dist/modules/workflow/blocks/WeiboCollectCommentsBlock.js +282 -0
- package/dist/modules/workflow/blocks/WeiboCollectFromLinksBlock.js +283 -0
- package/dist/modules/workflow/blocks/WeiboCollectSearchLinksBlock.js +208 -0
- package/dist/modules/workflow/blocks/WeiboCollectTimelineListBlock.js +128 -0
- package/dist/modules/workflow/blocks/WeiboCollectUserPostsListBlock.js +127 -0
- package/dist/modules/workflow/blocks/helpers/downloadPaths.js +21 -0
- package/dist/modules/workflow/config/workflowRegistry.js +2 -0
- package/dist/modules/workflow/definitions/weibo-search-workflow-v1.js +47 -0
- package/dist/modules/workflow/src/runner.js +6 -0
- package/dist/modules/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.js +4 -0
- package/dist/modules/xiaohongshu/app/src/blocks/Phase3InteractBlock.js +2 -2
- package/dist/modules/xiaohongshu/app/src/blocks/helpers/sharding.js +123 -0
- package/dist/modules/xiaohongshu/app/src/container-registry/src/index.d.ts +37 -0
- package/dist/modules/xiaohongshu/app/src/container-registry/src/index.js +184 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/AnchorVerificationBlock.d.ts +31 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/AnchorVerificationBlock.js +71 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/DetectPageStateBlock.d.ts +48 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/DetectPageStateBlock.js +259 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/ErrorRecoveryBlock.d.ts +28 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/ErrorRecoveryBlock.js +319 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/WaitSearchPermitBlock.d.ts +36 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/WaitSearchPermitBlock.js +162 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/containerAnchors.d.ts +36 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/containerAnchors.js +301 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/operationLogger.d.ts +29 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/operationLogger.js +195 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/searchPageState.d.ts +25 -0
- package/dist/modules/xiaohongshu/app/src/workflow/blocks/helpers/searchPageState.js +164 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/MatchCommentsBlock.d.ts +66 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/MatchCommentsBlock.js +139 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1EnsureServicesBlock.d.ts +16 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1EnsureServicesBlock.js +36 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1MonitorCookieBlock.d.ts +27 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1MonitorCookieBlock.js +213 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.d.ts +18 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.js +121 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2CollectLinksBlock.d.ts +34 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2CollectLinksBlock.js +1249 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2SearchBlock.d.ts +17 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase2SearchBlock.js +703 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseDetailBlock.d.ts +15 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseDetailBlock.js +41 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseTabsBlock.d.ts +26 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CloseTabsBlock.js +44 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CollectCommentsBlock.d.ts +29 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34CollectCommentsBlock.js +150 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ExtractDetailBlock.d.ts +38 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ExtractDetailBlock.js +117 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenDetailBlock.d.ts +30 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenDetailBlock.js +102 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenTabsBlock.d.ts +23 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34OpenTabsBlock.js +109 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.d.ts +32 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.js +117 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ProcessSingleNoteBlock.d.ts +35 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ProcessSingleNoteBlock.js +114 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ValidateLinksBlock.d.ts +34 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase34ValidateLinksBlock.js +90 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase3InteractBlock.d.ts +111 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase3InteractBlock.js +1009 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase4MultiTabHarvestBlock.d.ts +20 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/Phase4MultiTabHarvestBlock.js +233 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/ReplyInteractBlock.d.ts +48 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/ReplyInteractBlock.js +291 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/XhsDiscoverFallbackBlock.d.ts +23 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/XhsDiscoverFallbackBlock.js +240 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatchDsl.d.ts +55 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatchDsl.js +126 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatcher.d.ts +21 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/commentMatcher.js +99 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/evidence.d.ts +5 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/evidence.js +27 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/sharding.d.ts +37 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/sharding.js +165 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/xhsComments.d.ts +33 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/blocks/helpers/xhsComments.js +270 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/index.d.ts +9 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/index.js +9 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/checkpoints.d.ts +50 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/checkpoints.js +222 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/controllerAction.d.ts +10 -0
- package/dist/modules/xiaohongshu/app/src/xiaohongshu/app/src/utils/controllerAction.js +43 -0
- package/dist/services/shared/serviceProcessLogger.js +1 -1
- package/dist/services/unified-api/server.js +105 -11
- package/modules/camo-backend/src/index.ts +46 -1
- package/modules/camo-backend/src/internal/BrowserSession.ts +619 -3
- package/modules/camo-backend/src/internal/SessionManager.ts +12 -1
- package/modules/camo-backend/src/internal/storage-paths.ts +5 -0
- package/modules/camo-runtime/src/autoscript/action-providers/xhs/comments.mjs +38 -2
- package/modules/camo-runtime/src/autoscript/action-providers/xhs/interaction.mjs +47 -2
- package/modules/camo-runtime/src/autoscript/action-providers/xhs/search.mjs +94 -11
- package/modules/camo-runtime/src/autoscript/action-providers/xhs.mjs +208 -2
- package/modules/camo-runtime/src/autoscript/runtime.mjs +7 -1
- package/modules/camo-runtime/src/autoscript/xhs-unified-template.mjs +76 -43
- package/modules/camo-runtime/src/container/runtime-core/operations/index.mjs +75 -1
- package/modules/camo-runtime/src/container/runtime-core/operations/selector-scripts.mjs +71 -4
- package/modules/camo-runtime/src/container/runtime-core/operations/tab-pool.mjs +183 -27
- package/modules/collection-manager/bloom-filter.ts +112 -0
- package/modules/collection-manager/date-utils.ts +316 -0
- package/modules/collection-manager/index.ts +309 -0
- package/modules/collection-manager/package.json +10 -0
- package/modules/collection-manager/storage.ts +174 -0
- package/modules/collection-manager/types.ts +156 -0
- package/modules/logging/src/index.ts +1 -1
- package/modules/process-registry/index.ts +284 -0
- package/modules/rate-limiter/index.ts +322 -0
- package/modules/state/src/paths.ts +9 -1
- package/modules/task-scheduler/index.ts +293 -0
- package/modules/workflow/blocks/ExecuteWeiboSearchBlock.ts +167 -0
- package/modules/workflow/blocks/PersistXhsNoteBlock.ts +7 -3
- package/modules/workflow/blocks/RenderMarkdown.ts +4 -1
- package/modules/workflow/blocks/WeiboCollectCommentsBlock.ts +339 -0
- package/modules/workflow/blocks/WeiboCollectFromLinksBlock.ts +338 -0
- package/modules/workflow/blocks/helpers/downloadPaths.ts +16 -0
- package/modules/workflow/config/workflowRegistry.ts +2 -0
- package/modules/workflow/definitions/weibo-search-workflow-v1.ts +47 -0
- package/modules/workflow/src/runner.ts +6 -0
- package/modules/xiaohongshu/app/src/blocks/Phase1StartProfileBlock.ts +1 -1
- package/modules/xiaohongshu/app/src/blocks/Phase34PersistDetailBlock.ts +4 -0
- package/modules/xiaohongshu/app/src/blocks/Phase3InteractBlock.ts +2 -3
- package/modules/xiaohongshu/app/src/blocks/helpers/sharding.ts +152 -0
- package/package.json +13 -4
- package/scripts/postinstall-resources.mjs +62 -0
- package/scripts/test/run-coverage.mjs +76 -0
- package/scripts/weibo/search.ts +49 -0
- package/services/shared/serviceProcessLogger.ts +1 -1
- package/services/unified-api/server.ts +98 -12
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Collection Data Manager
|
|
3
|
+
*
|
|
4
|
+
* Unified data management for all platforms and collection types.
|
|
5
|
+
* Handles:
|
|
6
|
+
* - Deduplication via Bloom Filter
|
|
7
|
+
* - Fresh/Incremental modes
|
|
8
|
+
* - File storage with human-readable paths
|
|
9
|
+
* - Stats and persistence
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import * as os from 'os';
|
|
13
|
+
import * as path from 'path';
|
|
14
|
+
import { BloomFilter } from './bloom-filter';
|
|
15
|
+
import { CollectionStorage } from './storage';
|
|
16
|
+
import { getCurrentTimestamp, parsePlatformDate } from './date-utils';
|
|
17
|
+
import type {
|
|
18
|
+
Platform,
|
|
19
|
+
CollectionSource,
|
|
20
|
+
CollectionMode,
|
|
21
|
+
CollectionMeta,
|
|
22
|
+
CollectionIdSpec,
|
|
23
|
+
PostRecord,
|
|
24
|
+
CommentRecord
|
|
25
|
+
} from './types';
|
|
26
|
+
import { buildCollectionId, parseCollectionId } from './types';
|
|
27
|
+
|
|
28
|
+
/**
 * Construction options for `CollectionDataManager`.
 */
export interface CollectionManagerOptions {
  /** Platform the collection belongs to; forwarded to the storage layer. */
  platform: Platform;
  /** Environment label; forwarded to the storage layer alongside the platform. */
  env: string;
  /** Collection descriptor; `buildCollectionId(spec)` turns it into the collection ID. */
  spec: CollectionIdSpec;
  /** Collection mode; the manager falls back to `'incremental'` when omitted. */
  mode?: CollectionMode;
  /** Root directory for collected data; defaults to `<home>/.webauto/download`. */
  baseDir?: string;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Counters exposed by `CollectionDataManager.getStats()`.
 */
export interface CollectionStats {
  /** Posts in storage: seeded from the stored line count in `init()`, then bumped per added post. */
  totalPosts: number;
  /** Comments in storage: seeded from the stored line count in `init()`, then bumped per added comment. */
  totalComments: number;
  /** Posts added through this manager instance. */
  newPosts: number;
  /** Comments added through this manager instance. */
  newComments: number;
  /** Posts rejected as duplicates by `addPost` (duplicate comments are not counted here). */
  duplicatesSkipped: number;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Coordinates storage, deduplication and statistics for a single collection.
 *
 * Deduplication is probabilistic: post IDs and `postId:commentId` keys share
 * one Bloom filter, so a small fraction of genuinely new items may be reported
 * as duplicates (false positives), but a stored item is never reported as new.
 *
 * Typical lifecycle: `new CollectionDataManager(...)` -> `init()` ->
 * `addPost()` / `addComment()` -> `persist()`.
 */
export class CollectionDataManager {
  /** Number of keys the dedup filter is sized for. */
  private static readonly BLOOM_CAPACITY = 500000;
  /** Target false-positive rate of the dedup filter (0.1%). */
  private static readonly BLOOM_FALSE_POSITIVE_RATE = 0.001;

  private platform: Platform;
  private env: string;
  private spec: CollectionIdSpec;
  private mode: CollectionMode;
  private collectionId: string;
  private storage: CollectionStorage;
  private bloomFilter: BloomFilter;
  private meta: CollectionMeta | null = null;
  /** Keys whose storage write is still in flight; prevents concurrent double-adds. */
  private pendingKeys = new Set<string>();
  private stats: CollectionStats = {
    totalPosts: 0,
    totalComments: 0,
    newPosts: 0,
    newComments: 0,
    duplicatesSkipped: 0
  };

  /**
   * @param options Platform, environment and collection spec; `mode` defaults
   *   to `'incremental'` and `baseDir` to `<home>/.webauto/download`.
   */
  constructor(options: CollectionManagerOptions) {
    this.platform = options.platform;
    this.env = options.env;
    this.spec = options.spec;
    this.mode = options.mode ?? 'incremental';

    // Build human-readable collection ID
    this.collectionId = buildCollectionId(options.spec);

    // Default base directory
    const baseDir = options.baseDir || path.join(os.homedir(), '.webauto', 'download');

    this.storage = new CollectionStorage(baseDir, this.platform, this.env, this.collectionId);
    this.bloomFilter = CollectionDataManager.createBloomFilter();
  }

  /** Create an empty dedup filter with the class-wide sizing. */
  private static createBloomFilter(): BloomFilter {
    return new BloomFilter(
      CollectionDataManager.BLOOM_CAPACITY,
      CollectionDataManager.BLOOM_FALSE_POSITIVE_RATE
    );
  }

  /** Dedup key used for comments; scoped by post so IDs may repeat across posts. */
  private static commentKey(comment: CommentRecord): string {
    return `${comment.postId}:${comment.id}`;
  }

  /**
   * Initialize the collection manager.
   *
   * - `fresh` mode: clears stored data and starts with an empty filter.
   * - `incremental` mode: reuses existing meta when present and restores the
   *   dedup filter from it; when the meta is missing, has no filter state, or
   *   the state cannot be imported, the filter is rebuilt from the posts and
   *   comments already in storage so previously collected items stay deduped.
   *
   * Always (re)writes the meta file and seeds the totals from storage.
   */
  async init(): Promise<void> {
    await this.storage.init();

    if (this.mode === 'fresh') {
      // Clear all existing data and forget any keys seen by this instance.
      await this.storage.clear();
      this.bloomFilter = CollectionDataManager.createBloomFilter();
      this.meta = this.createMeta();
    } else {
      const existingMeta = await this.storage.readMeta();

      if (existingMeta) {
        this.meta = existingMeta;
        this.meta.updatedAt = new Date().toISOString();
      } else {
        this.meta = this.createMeta();
      }

      const restored =
        existingMeta !== null && this.restoreBloomFilter(existingMeta.bloomFilter);
      if (!restored) {
        // Data files may exist even when the meta (or its filter state) does not.
        await this.rebuildBloomFilter();
      }
    }

    await this.storage.writeMeta(this.meta);

    // Initialize stats from existing data
    const storageStats = await this.storage.getStats();
    this.stats.totalPosts = storageStats.postCount;
    this.stats.totalComments = storageStats.commentCount;
  }

  /**
   * Try to replace the filter with a previously exported state.
   * @returns `true` when the state was imported, `false` when absent or invalid.
   */
  private restoreBloomFilter(serialized: string | undefined): boolean {
    if (!serialized) {
      return false;
    }
    try {
      this.bloomFilter = BloomFilter.import(serialized, CollectionDataManager.BLOOM_CAPACITY);
      return true;
    } catch {
      // Unusable state: the caller falls back to rebuilding from storage.
      return false;
    }
  }

  /** Re-populate the filter from every post and comment currently in storage. */
  private async rebuildBloomFilter(): Promise<void> {
    const posts = await this.storage.readPosts();
    for (const post of posts) {
      this.bloomFilter.add(post.id);
    }

    const comments = await this.storage.readComments();
    for (const comment of comments) {
      this.bloomFilter.add(CollectionDataManager.commentKey(comment));
    }
  }

  /** Build the initial meta record for this collection from the constructor options. */
  private createMeta(): CollectionMeta {
    const now = new Date().toISOString();
    const meta: CollectionMeta = {
      platform: this.platform,
      env: this.env,
      collectionId: this.collectionId,
      source: this.spec.source,
      createdAt: now,
      updatedAt: now,
      totalPosts: 0,
      totalComments: 0,
      mode: this.mode
    };

    // Source-specific descriptors are only present for the matching source type.
    if (this.spec.source === 'search') {
      meta.keyword = (this.spec as { keyword: string }).keyword;
    }
    if (this.spec.source === 'user') {
      meta.userId = (this.spec as { userId: string }).userId;
      meta.userName = (this.spec as { userName?: string }).userName;
    }

    return meta;
  }

  /**
   * Run `write` for `key` unless the key was already seen.
   *
   * The key is added to the filter only after `write` succeeds, so a failed
   * write does not permanently hide the item; `pendingKeys` covers the window
   * in between so concurrent calls with the same key still dedupe.
   *
   * @returns `true` when `write` ran and succeeded, `false` for a duplicate.
   * @throws Whatever `write` throws.
   */
  private async writeIfUnseen(key: string, write: () => Promise<void>): Promise<boolean> {
    if (this.pendingKeys.has(key) || this.bloomFilter.mightContain(key)) {
      return false;
    }

    this.pendingKeys.add(key);
    try {
      await write();
      this.bloomFilter.add(key);
    } finally {
      this.pendingKeys.delete(key);
    }
    return true;
  }

  /**
   * Check if a post ID already exists (via bloom filter).
   * May return `true` for an unseen ID (false positive), never `false` for a seen one.
   */
  hasPost(postId: string): boolean {
    return this.bloomFilter.mightContain(postId);
  }

  /**
   * Add a post if it is not a duplicate.
   *
   * Fills `collectedAt`, `collectedAtLocal` and `collectedDate` on the passed
   * record when `collectedAt` is empty.
   *
   * @returns `true` if the post was stored, `false` if it was skipped as a duplicate.
   */
  async addPost(post: PostRecord): Promise<boolean> {
    const added = await this.writeIfUnseen(post.id, async () => {
      // Auto-fill collectedAt fields if not provided
      if (!post.collectedAt) {
        const ts = getCurrentTimestamp();
        post.collectedAt = ts.collectedAt;
        post.collectedAtLocal = ts.collectedAtLocal;
        post.collectedDate = ts.collectedDate;
      }
      await this.storage.appendPost(post);
    });

    if (!added) {
      this.stats.duplicatesSkipped++;
      return false;
    }

    this.stats.totalPosts++;
    this.stats.newPosts++;
    return true;
  }

  /**
   * Add a comment if it is not a duplicate.
   *
   * Comments are deduplicated on the `postId:commentId` key, so the same
   * comment ID may appear under different posts. Skipped comments do not
   * affect `duplicatesSkipped`, which counts posts only. `collectedAt` is
   * filled on the passed record when empty, mirroring `addPost`.
   *
   * @returns `true` if the comment was stored, `false` if it was skipped.
   */
  async addComment(comment: CommentRecord): Promise<boolean> {
    const added = await this.writeIfUnseen(
      CollectionDataManager.commentKey(comment),
      async () => {
        if (!comment.collectedAt) {
          comment.collectedAt = getCurrentTimestamp().collectedAt;
        }
        await this.storage.appendComment(comment);
      }
    );

    if (!added) {
      return false;
    }

    this.stats.totalComments++;
    this.stats.newComments++;
    return true;
  }

  /**
   * Persist metadata and bloom filter state.
   * Does nothing when `init()` has not produced a meta record yet.
   */
  async persist(): Promise<void> {
    if (!this.meta) return;

    this.meta.updatedAt = new Date().toISOString();
    this.meta.totalPosts = this.stats.totalPosts;
    this.meta.totalComments = this.stats.totalComments;
    this.meta.bloomFilter = this.bloomFilter.export();

    await this.storage.writeMeta(this.meta);
  }

  /**
   * Get a snapshot copy of the current stats.
   */
  getStats(): CollectionStats {
    return { ...this.stats };
  }

  /**
   * Get collection metadata (`null` until `init()` has run).
   */
  getMeta(): CollectionMeta | null {
    return this.meta;
  }

  /**
   * Get collection ID
   */
  getCollectionId(): string {
    return this.collectionId;
  }

  /**
   * Get storage paths for external use (e.g., logging)
   */
  getPaths(): {
    collectionDir: string;
    postsPath: string;
    commentsPath: string;
    linksPath: string;
    runLogPath: string;
  } {
    const postsPath = this.storage.getPostsPath();
    return {
      // The storage layer keeps every data file directly inside the collection
      // directory, so it is the parent of the posts file; this avoids reaching
      // into the storage object's private state.
      collectionDir: path.dirname(postsPath),
      postsPath,
      commentsPath: this.storage.getCommentsPath(),
      linksPath: this.storage.getLinksPath(),
      runLogPath: this.storage.getRunLogPath()
    };
  }

  /**
   * List all collections for this platform and environment.
   * `spec` is whatever `parseCollectionId` yields for the directory name.
   */
  static async listCollections(
    platform: Platform,
    env: string,
    baseDir?: string
  ): Promise<Array<{ collectionId: string; spec: CollectionIdSpec | null }>> {
    const dir = baseDir || path.join(os.homedir(), '.webauto', 'download');
    const ids = await CollectionStorage.listCollections(dir, platform, env);

    return ids.map(id => ({
      collectionId: id,
      spec: parseCollectionId(id)
    }));
  }

  /**
   * Merge the posts of multiple collections into one.
   * Useful for combining timeline data from multiple dates.
   *
   * Posts are deduplicated by ID across all sources and against posts already
   * present in the target, so repeating a merge does not duplicate data.
   * Only posts are merged; comments are not copied.
   */
  static async mergeCollections(
    sourceIds: string[],
    targetId: string,
    platform: Platform,
    env: string,
    baseDir?: string
  ): Promise<void> {
    const dir = baseDir || path.join(os.homedir(), '.webauto', 'download');
    const targetStorage = new CollectionStorage(dir, platform, env, targetId);
    await targetStorage.init();

    // Start from what the target already holds so existing posts are not re-appended.
    const seenPostIds = new Set<string>();
    for (const existing of await targetStorage.readPosts()) {
      seenPostIds.add(existing.id);
    }

    for (const sourceId of sourceIds) {
      const sourceStorage = new CollectionStorage(dir, platform, env, sourceId);
      const posts = await sourceStorage.readPosts();

      for (const post of posts) {
        if (!seenPostIds.has(post.id)) {
          seenPostIds.add(post.id);
          await targetStorage.appendPost(post);
        }
      }
    }
  }
}
|
|
305
|
+
|
|
306
|
+
export { buildCollectionId, parseCollectionId };
|
|
307
|
+
export type { CollectionIdSpec, CollectionMeta, PostRecord, CommentRecord };
|
|
308
|
+
|
|
309
|
+
export { getCurrentTimestamp, parsePlatformDate, extractWeiboPostDate } from "./date-utils.js";
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File storage utilities for collection data
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import * as fs from 'fs';
|
|
6
|
+
import * as path from 'path';
|
|
7
|
+
import type { CollectionMeta, PostRecord, CommentRecord } from './types';
|
|
8
|
+
|
|
9
|
+
/**
 * File-backed storage for one collection.
 *
 * Layout: `baseDir/platform/env/collectionId/` containing
 * `collection-meta.json`, `posts.jsonl`, `comments.jsonl`, `links.jsonl`,
 * `run.log` and `run-events.jsonl`.
 */
export class CollectionStorage {
  private baseDir: string;
  private collectionDir: string;
  private platform: string;
  private env: string;
  private collectionId: string;

  constructor(baseDir: string, platform: string, env: string, collectionId: string) {
    this.baseDir = baseDir;
    this.platform = platform;
    this.env = env;
    this.collectionId = collectionId;

    // Directory structure: baseDir/platform/env/collectionId/
    this.collectionDir = path.join(baseDir, platform, env, collectionId);
  }

  /** `true` when `err` is a filesystem error carrying the given `code`. */
  private static hasErrorCode(err: unknown, code: string): boolean {
    return typeof err === 'object' && err !== null && (err as { code?: unknown }).code === code;
  }

  /** Delete `file`, treating "does not exist" as success and rethrowing anything else. */
  private static async unlinkIfExists(file: string): Promise<void> {
    try {
      await fs.promises.unlink(file);
    } catch (err) {
      if (!CollectionStorage.hasErrorCode(err, 'ENOENT')) {
        throw err;
      }
    }
  }

  /**
   * Read a JSONL file into records.
   *
   * Blank lines and lines that are not a JSON object (for example a line
   * truncated by an interrupted write) are skipped instead of discarding the
   * whole file. An unreadable or missing file yields an empty list.
   */
  private static async readJsonLines<T>(filePath: string): Promise<T[]> {
    let data: string;
    try {
      data = await fs.promises.readFile(filePath, 'utf-8');
    } catch {
      return [];
    }

    const records: T[] = [];
    for (const rawLine of data.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line.length === 0) {
        continue;
      }
      try {
        const parsed: unknown = JSON.parse(line);
        if (typeof parsed === 'object' && parsed !== null) {
          records.push(parsed as T);
        }
      } catch {
        // Corrupt line: keep the rest of the file usable.
      }
    }
    return records;
  }

  /**
   * Initialize storage directories
   */
  async init(): Promise<void> {
    await fs.promises.mkdir(this.collectionDir, { recursive: true });
  }

  /**
   * Directory that holds every file of this collection.
   */
  getCollectionDir(): string {
    return this.collectionDir;
  }

  /**
   * Get paths for different files
   */
  getMetaPath(): string {
    return path.join(this.collectionDir, 'collection-meta.json');
  }

  getPostsPath(): string {
    return path.join(this.collectionDir, 'posts.jsonl');
  }

  getCommentsPath(): string {
    return path.join(this.collectionDir, 'comments.jsonl');
  }

  getLinksPath(): string {
    return path.join(this.collectionDir, 'links.jsonl');
  }

  getRunLogPath(): string {
    return path.join(this.collectionDir, 'run.log');
  }

  getEventsPath(): string {
    return path.join(this.collectionDir, 'run-events.jsonl');
  }

  /**
   * Read the meta file.
   * @returns The parsed meta, or `null` when the file is missing, unreadable,
   *   not valid JSON, or not a JSON object.
   */
  async readMeta(): Promise<CollectionMeta | null> {
    try {
      const data = await fs.promises.readFile(this.getMetaPath(), 'utf-8');
      const parsed: unknown = JSON.parse(data);
      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
        return null;
      }
      return parsed as CollectionMeta;
    } catch {
      return null;
    }
  }

  /**
   * Write the meta file.
   *
   * The content is written to a temporary sibling file and then renamed over
   * the target, so an interrupted write cannot leave a half-written meta file.
   *
   * @throws When the write or rename fails; the temporary file is removed first.
   */
  async writeMeta(meta: CollectionMeta): Promise<void> {
    const metaPath = this.getMetaPath();
    // Unique per call so overlapping writes never share a temporary file.
    const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
    const tmpPath = `${metaPath}.${uniqueSuffix}.tmp`;

    try {
      await fs.promises.writeFile(tmpPath, JSON.stringify(meta, null, 2), 'utf-8');
      await fs.promises.rename(tmpPath, metaPath);
    } catch (err) {
      try {
        await CollectionStorage.unlinkIfExists(tmpPath);
      } catch {
        // Cleanup is best-effort; the original failure is the one to report.
      }
      throw err;
    }
  }

  /**
   * Append post to JSONL
   */
  async appendPost(post: PostRecord): Promise<void> {
    const line = JSON.stringify(post) + '\n';
    await fs.promises.appendFile(this.getPostsPath(), line, 'utf-8');
  }

  /**
   * Append comment to JSONL
   */
  async appendComment(comment: CommentRecord): Promise<void> {
    const line = JSON.stringify(comment) + '\n';
    await fs.promises.appendFile(this.getCommentsPath(), line, 'utf-8');
  }

  /**
   * Read all posts; blank or corrupt lines are skipped, a missing file yields `[]`.
   */
  async readPosts(): Promise<PostRecord[]> {
    return CollectionStorage.readJsonLines<PostRecord>(this.getPostsPath());
  }

  /**
   * Read all comments; blank or corrupt lines are skipped, a missing file yields `[]`.
   */
  async readComments(): Promise<CommentRecord[]> {
    return CollectionStorage.readJsonLines<CommentRecord>(this.getCommentsPath());
  }

  /**
   * Count non-blank lines in a JSONL file (lines are not validated as JSON).
   * @returns `0` when the file cannot be read.
   */
  async countLines(filePath: string): Promise<number> {
    try {
      const data = await fs.promises.readFile(filePath, 'utf-8');
      return data.split(/\r?\n/).filter(line => line.trim().length > 0).length;
    } catch {
      return 0;
    }
  }

  /**
   * Get collection stats
   */
  async getStats(): Promise<{ postCount: number; commentCount: number }> {
    return {
      postCount: await this.countLines(this.getPostsPath()),
      commentCount: await this.countLines(this.getCommentsPath())
    };
  }

  /**
   * Clear collected data (for fresh mode): posts, comments, links and run
   * events. The meta file and `run.log` are left in place.
   *
   * @throws When a file exists but cannot be deleted, so stale data is never
   *   silently kept.
   */
  async clear(): Promise<void> {
    const files = [
      this.getPostsPath(),
      this.getCommentsPath(),
      this.getLinksPath(),
      this.getEventsPath()
    ];

    for (const file of files) {
      await CollectionStorage.unlinkIfExists(file);
    }
  }

  /**
   * List all collections for a platform/env
   * @returns Names of the sub-directories, or `[]` when the directory cannot be read.
   */
  static async listCollections(baseDir: string, platform: string, env: string): Promise<string[]> {
    const dir = path.join(baseDir, platform, env);
    try {
      const entries = await fs.promises.readdir(dir, { withFileTypes: true });
      return entries
        .filter(e => e.isDirectory())
        .map(e => e.name);
    } catch {
      return [];
    }
  }
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/** Collection types for unified data management across platforms */
|
|
2
|
+
|
|
3
|
+
/** Platforms a collection can belong to. */
export type Platform = 'xiaohongshu' | 'weibo' | '1688';

/** Kind of source a collection was gathered from. */
export type CollectionSource = 'search' | 'timeline' | 'user' | 'note' | 'product';

/** How a run treats previously collected data. */
export type CollectionMode = 'fresh' | 'incremental';
|
|
8
|
+
|
|
9
|
+
/**
 * Metadata record describing one collection.
 */
export interface CollectionMeta {
  /** Platform the collection belongs to */
  platform: Platform;
  /** Environment label the collection was created under */
  env: string;
  /** Human-readable identifier - varies by source type */
  collectionId: string;
  /** Source type */
  source: CollectionSource;
  /** Creation timestamp */
  createdAt: string;
  /** Last update timestamp */
  updatedAt: string;
  /** Total posts collected */
  totalPosts: number;
  /** Total comments collected */
  totalComments: number;
  /** Collection mode */
  mode: CollectionMode;
  /** Bloom filter state (base64) */
  bloomFilter?: string;
  /** Optional: search keyword */
  keyword?: string;
  /** Optional: user ID for user monitoring */
  userId?: string;
  /** Optional: user name for display */
  userName?: string;
}
|
|
35
|
+
|
|
36
|
+
/** One collected post. */
export interface PostRecord {
  /** Unique ID assigned by the platform. */
  id: string;
  /** URL of the post. */
  url: string;
  /** ID of the author. */
  authorId?: string;
  /** Name of the author. */
  authorName?: string;
  /** Content of the post. */
  content?: string;
  /** Time of collection as an ISO 8601 UTC timestamp. */
  collectedAt: string;
  /** Time of collection in local time, including the timezone. */
  collectedAtLocal?: string;
  /** Local date of collection (YYYY-MM-DD). */
  collectedDate?: string;
  /** Publication date extracted from the content (YYYY-MM-DD). */
  publishedDate?: string;
  /** Publication time extracted from the content (HH:MM). */
  publishedTime?: string;
  /** Full text of the published datetime. */
  publishedAtText?: string;
  /** Raw data as received from the platform. */
  raw?: Record<string, unknown>;
  /** Number of comments. */
  commentsCount?: number;
  /** Number of likes. */
  likesCount?: number;
}
|
|
66
|
+
|
|
67
|
+
/** One collected comment attached to a post. */
export interface CommentRecord {
  /** Unique ID of the comment. */
  id: string;
  /** ID of the post the comment belongs to. */
  postId: string;
  /** ID of the author. */
  authorId?: string;
  /** Name of the author. */
  authorName?: string;
  /** Content of the comment. */
  content?: string;
  /** ID of the parent comment, for nested comments. */
  parentId?: string;
  /** Name of the user being replied to. */
  replyToName?: string;
  /** Time of collection. */
  collectedAt: string;
  /** Raw data. */
  raw?: Record<string, unknown>;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Structured form of a collection ID, discriminated by `source`.
 *
 * Naming conventions for the string form:
 *
 *   search:   "search:<keyword>"           e.g. search:gala
 *   timeline: "timeline:<date>"            e.g. timeline:2026-02-20
 *   user:     "user:<userId>:<userName>"   e.g. user:1234567890:zhangsan
 *   note:     "note:<noteId>"              e.g. note:abc123
 *   product:  "product:<keyword>"          e.g. product:dresses
 */
export type CollectionIdSpec =
  | { source: 'search'; keyword: string }
  // `date` is formatted as YYYY-MM-DD.
  | { source: 'timeline'; date: string }
  | { source: 'user'; userId: string; userName?: string }
  | { source: 'note'; noteId: string }
  | { source: 'product'; keyword: string };
|
|
114
|
+
|
|
115
|
+
/**
 * Build the string collection ID for a spec.
 *
 * The result is `<source>:<payload>`. For `user` specs the payload is
 * `<userId>:<userName>`, where characters `/ \ : * ? " < > |` in the user name
 * are replaced with `_` and a missing or empty name becomes `unknown`.
 *
 * @param spec - Structured description of the collection.
 * @returns The collection ID string.
 * @throws Error if `spec.source` is not one of the known sources (only
 *         reachable when the type system is bypassed).
 */
export function buildCollectionId(spec: CollectionIdSpec): string {
  switch (spec.source) {
    case 'search':
      return `search:${spec.keyword}`;
    case 'timeline':
      return `timeline:${spec.date}`;
    case 'user': {
      // Braces keep `safeName` scoped to this case only.
      // `||` (not `??`) is intentional: an empty name also falls back to 'unknown'.
      const safeName = spec.userName?.replace(/[\/\\:*?"<>|]/g, '_') || 'unknown';
      return `user:${spec.userId}:${safeName}`;
    }
    case 'note':
      return `note:${spec.noteId}`;
    case 'product':
      return `product:${spec.keyword}`;
    default: {
      // Exhaustiveness guard: fail loudly instead of silently returning
      // `undefined` from a function declared to return `string`.
      const unexpected: never = spec;
      const badSource = (unexpected as { source?: unknown }).source;
      throw new Error(`Unknown collection source: ${String(badSource)}`);
    }
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Parse a collection ID string back into its structured spec.
 *
 * The ID is split at the first `:` into a source prefix and a payload. For
 * every source except `user` the whole payload is returned unchanged, so
 * payloads that themselves contain `:` survive a build/parse round trip.
 * For `user` the payload is split once more at its first `:` into
 * `userId` and `userName`.
 *
 * @param id - Collection ID string, e.g. `search:keyword`.
 * @returns The parsed spec, or `null` when `id` contains no `:` or its prefix
 *          is not a known source.
 */
export function parseCollectionId(id: string): CollectionIdSpec | null {
  const prefixEnd = id.indexOf(':');
  if (prefixEnd < 0) return null;

  const source = id.slice(0, prefixEnd);
  const payload = id.slice(prefixEnd + 1);

  switch (source) {
    case 'search':
      return { source: 'search', keyword: payload };
    case 'timeline':
      return { source: 'timeline', date: payload };
    case 'user': {
      const nameStart = payload.indexOf(':');
      if (nameStart < 0) {
        // No name segment: keep the `userName` key present but undefined.
        return { source: 'user', userId: payload, userName: undefined };
      }
      return {
        source: 'user',
        userId: payload.slice(0, nameStart),
        userName: payload.slice(nameStart + 1),
      };
    }
    case 'note':
      return { source: 'note', noteId: payload };
    case 'product':
      return { source: 'product', keyword: payload };
    default:
      return null;
  }
}
|
|
@@ -53,7 +53,7 @@ export function logDebug(module: string, event: string, data: Record<string, any
|
|
|
53
53
|
data,
|
|
54
54
|
};
|
|
55
55
|
try {
|
|
56
|
-
fs.appendFileSync(DEBUG_LOG_FILE, `${JSON.stringify(entry)}\n
|
|
56
|
+
fs.appendFileSync(DEBUG_LOG_FILE, `${JSON.stringify(entry)}\n`, 'utf8');
|
|
57
57
|
} catch {
|
|
58
58
|
// ignore
|
|
59
59
|
}
|