cli-community-intelligence 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +6523 -0
- package/dist/index.js +844 -0
- package/package.json +17 -0
- package/schema.sql +66 -0
package/package.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "cli-community-intelligence",
|
|
3
|
+
"version": "0.1.5",
|
|
4
|
+
"description": "Community intelligence scraper for construction industry market research",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"community-intelligence": "dist/cli.js"
|
|
9
|
+
},
|
|
10
|
+
"repository": {
|
|
11
|
+
"type": "git",
|
|
12
|
+
"url": "https://github.com/graffio/graffio-monorepo.git"
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"better-sqlite3": "^8.7.0"
|
|
16
|
+
}
|
|
17
|
+
}
|
package/schema.sql
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
-- =====================================================
|
|
2
|
+
-- Community Intelligence Corpus Schema
|
|
3
|
+
-- =====================================================
|
|
4
|
+
--
|
|
5
|
+
-- Stores scraped posts and comments from online communities.
|
|
6
|
+
-- Column names use camelCase to match JavaScript conventions.
|
|
7
|
+
--
|
|
8
|
+
|
|
9
|
+
-- posts: one row per scraped item. Posts and comments share this table and
-- are told apart by postType; a comment points at its parent via parentId.
CREATE TABLE IF NOT EXISTS posts (
    id        INTEGER PRIMARY KEY AUTOINCREMENT,
    source    TEXT NOT NULL,   -- 'reddit', 'youtube', 'forum'
    sourceId  TEXT NOT NULL,   -- Original ID from source platform
    channel   TEXT NOT NULL,   -- Subreddit, YouTube channel, forum name
    author    TEXT,            -- Username (may be null if deleted)
    title     TEXT,            -- Post title (null for comments)
    body      TEXT NOT NULL,   -- Post or comment text content
    parentId  TEXT,            -- Parent post sourceId (for comments)
    url       TEXT,            -- Permalink
    score     INTEGER,         -- Upvotes, likes, etc.
    createdAt TEXT NOT NULL,   -- ISO 8601 timestamp from source
    -- When we scraped it. The default is built with strftime() instead of
    -- datetime('now'): datetime() yields 'YYYY-MM-DD HH:MM:SS' (no 'T', no
    -- zone), which does not sort or parse consistently with the ISO 8601
    -- values stored in the other timestamp columns. This form yields UTC as
    -- 'YYYY-MM-DDTHH:MM:SS.SSSZ'.
    scrapedAt TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')),
    postType  TEXT NOT NULL CHECK (postType IN ('post', 'comment'))
);
|
|
24
|
+
|
|
25
|
+
-- Each (source, sourceId) pair may appear at most once in posts.
CREATE UNIQUE INDEX IF NOT EXISTS idx_posts_source_sourceId
    ON posts (source, sourceId);

-- Single-column lookups by channel and by creation time.
CREATE INDEX IF NOT EXISTS idx_posts_channel
    ON posts (channel);

CREATE INDEX IF NOT EXISTS idx_posts_createdAt
    ON posts (createdAt);

-- Composite index on source and channel, with createdAt stored newest first.
CREATE INDEX IF NOT EXISTS idx_posts_source_channel_createdAt
    ON posts (source, channel, createdAt DESC);
|
|
29
|
+
|
|
30
|
+
-- =====================================================
|
|
31
|
+
-- Scrape Runs — logs each scrape operation for analytics
|
|
32
|
+
-- =====================================================
|
|
33
|
+
|
|
34
|
+
-- scrape_runs: one row per scrape operation, recording what was scraped,
-- when it ran, and how many posts/comments were found versus inserted.
CREATE TABLE IF NOT EXISTS scrape_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- 'reddit', 'youtube', 'forum'
    source TEXT NOT NULL,

    -- Subreddit, YouTube channel, forum name
    channel TEXT NOT NULL,

    -- ISO 8601 timestamps bracketing the run
    startedAt   TEXT NOT NULL,
    completedAt TEXT NOT NULL,

    -- Counters; each starts at zero when not supplied
    postsFound       INTEGER NOT NULL DEFAULT 0,
    postsInserted    INTEGER NOT NULL DEFAULT 0,
    commentsFound    INTEGER NOT NULL DEFAULT 0,
    commentsInserted INTEGER NOT NULL DEFAULT 0
);
|
|
45
|
+
|
|
46
|
+
-- Lookup of scrape runs by source and channel.
CREATE INDEX IF NOT EXISTS idx_scrape_runs_source_channel
    ON scrape_runs (source, channel);
|
|
47
|
+
|
|
48
|
+
-- =====================================================
|
|
49
|
+
-- Revisit Queue — tracks posts needing follow-up scrapes
|
|
50
|
+
-- =====================================================
|
|
51
|
+
--
|
|
52
|
+
-- One entry per post. When more than one reason applies, reason holds the most specific one (has_more > high_score > recent).
|
|
53
|
+
-- Exit conditions (OR): post age > threshold OR unchangedCount > threshold.
|
|
54
|
+
--
|
|
55
|
+
|
|
56
|
+
-- revisit_queue: posts that should be scraped again, with the bookkeeping
-- needed to decide when to stop revisiting them.
CREATE TABLE IF NOT EXISTS revisit_queue (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- 'reddit', 'youtube', 'forum'
    source TEXT NOT NULL,

    -- Post sourceId from the posts table
    sourceId TEXT NOT NULL,

    -- Why the post is queued; restricted to the three known reasons
    reason TEXT NOT NULL
        CHECK (reason IN ('recent', 'high_score', 'has_more')),

    -- Count of "more" placeholder objects in comment tree
    moreCount INTEGER NOT NULL DEFAULT 0,

    -- ISO 8601 timestamp of last revisit
    lastCheckedAt TEXT,

    -- Consecutive revisits with zero new comments
    unchangedCount INTEGER NOT NULL DEFAULT 0
);
|
|
65
|
+
|
|
66
|
+
-- At most one queue entry per (source, sourceId) pair.
CREATE UNIQUE INDEX IF NOT EXISTS idx_revisit_queue_source_sourceId
    ON revisit_queue (source, sourceId);
|