crushdataai 1.2.8 → 1.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/assets/{antigravity → .agent/workflows}/data-analyst.md +1 -0
  2. package/assets/{claude → .claude/skills/data-analyst}/SKILL.md +5 -2
  3. package/assets/{cursor → .cursor/commands}/data-analyst.md +5 -2
  4. package/assets/{kiro → .kiro/steering}/data-analyst.md +5 -0
  5. package/assets/.shared/data-analyst/data/charts.csv +31 -0
  6. package/assets/.shared/data-analyst/data/cleaning.csv +21 -0
  7. package/assets/.shared/data-analyst/data/workflows.csv +51 -0
  8. package/assets/.shared/data-analyst/databases.csv +35 -0
  9. package/assets/.shared/data-analyst/industries/ecommerce.csv +25 -0
  10. package/assets/.shared/data-analyst/industries/finance.csv +24 -0
  11. package/assets/.shared/data-analyst/industries/marketing.csv +25 -0
  12. package/assets/.shared/data-analyst/industries/saas.csv +24 -0
  13. package/assets/.shared/data-analyst/metrics.csv +74 -0
  14. package/assets/.shared/data-analyst/python-patterns.csv +31 -0
  15. package/assets/.shared/data-analyst/report-ux.csv +26 -0
  16. package/assets/.shared/data-analyst/scripts/__pycache__/core.cpython-311.pyc +0 -0
  17. package/assets/.shared/data-analyst/scripts/core.py +238 -0
  18. package/assets/.shared/data-analyst/scripts/search.py +61 -0
  19. package/assets/.shared/data-analyst/sql-patterns.csv +36 -0
  20. package/assets/.shared/data-analyst/validation.csv +21 -0
  21. package/assets/{windsurf → .windsurf/workflows}/data-analyst.md +5 -0
  22. package/package.json +1 -1
  23. package/ui/assets/{index-Ba1mRihD.js → index-DK2tLINh.js} +1 -1
  24. package/ui/index.html +1 -1
  25. /package/assets/{shared → .claude/skills/data-analyst/data}/charts.csv +0 -0
  26. /package/assets/{shared → .claude/skills/data-analyst/data}/cleaning.csv +0 -0
  27. /package/assets/{shared → .claude/skills/data-analyst}/data/databases.csv +0 -0
  28. /package/assets/{shared → .claude/skills/data-analyst}/data/industries/ecommerce.csv +0 -0
  29. /package/assets/{shared → .claude/skills/data-analyst}/data/industries/finance.csv +0 -0
  30. /package/assets/{shared → .claude/skills/data-analyst}/data/industries/marketing.csv +0 -0
  31. /package/assets/{shared → .claude/skills/data-analyst}/data/industries/saas.csv +0 -0
  32. /package/assets/{shared → .claude/skills/data-analyst}/data/metrics.csv +0 -0
  33. /package/assets/{shared → .claude/skills/data-analyst}/data/python-patterns.csv +0 -0
  34. /package/assets/{shared → .claude/skills/data-analyst}/data/report-ux.csv +0 -0
  35. /package/assets/{shared → .claude/skills/data-analyst}/data/sql-patterns.csv +0 -0
  36. /package/assets/{shared → .claude/skills/data-analyst}/data/validation.csv +0 -0
  37. /package/assets/{shared → .claude/skills/data-analyst}/data/workflows.csv +0 -0
  38. /package/assets/{shared → .claude/skills/data-analyst}/scripts/__pycache__/core.cpython-311.pyc +0 -0
  39. /package/assets/{shared → .claude/skills/data-analyst}/scripts/core.py +0 -0
  40. /package/assets/{shared → .claude/skills/data-analyst}/scripts/search.py +0 -0
  41. /package/assets/{copilot → .github/prompts}/data-analyst.prompt.md +0 -0
  42. /package/assets/{shared/data → .shared/data-analyst}/charts.csv +0 -0
  43. /package/assets/{shared/data → .shared/data-analyst}/cleaning.csv +0 -0
  44. /package/assets/{shared → .shared/data-analyst/data}/databases.csv +0 -0
  45. /package/assets/{shared → .shared/data-analyst/data}/industries/ecommerce.csv +0 -0
  46. /package/assets/{shared → .shared/data-analyst/data}/industries/finance.csv +0 -0
  47. /package/assets/{shared → .shared/data-analyst/data}/industries/marketing.csv +0 -0
  48. /package/assets/{shared → .shared/data-analyst/data}/industries/saas.csv +0 -0
  49. /package/assets/{shared → .shared/data-analyst/data}/metrics.csv +0 -0
  50. /package/assets/{shared → .shared/data-analyst/data}/python-patterns.csv +0 -0
  51. /package/assets/{shared → .shared/data-analyst/data}/report-ux.csv +0 -0
  52. /package/assets/{shared → .shared/data-analyst/data}/sql-patterns.csv +0 -0
  53. /package/assets/{shared → .shared/data-analyst/data}/validation.csv +0 -0
  54. /package/assets/{shared → .shared/data-analyst}/workflows.csv +0 -0
@@ -0,0 +1,238 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ CrushData AI Core - BM25 search engine for data analyst workflows
+ """
+
+ import csv
+ import re
+ from pathlib import Path
+ from math import log
+ from collections import defaultdict
+
+ # ============ CONFIGURATION ============
+ DATA_DIR = Path(__file__).parent.parent / "data"
+ MAX_RESULTS = 3
+
+ CSV_CONFIG = {
+     "workflow": {
+         "file": "workflows.csv",
+         "search_cols": ["Workflow Name", "Step Name", "Description", "Questions to Ask"],
+         "output_cols": ["Workflow Name", "Step Number", "Step Name", "Description", "Questions to Ask", "Tools/Commands", "Outputs", "Common Mistakes"]
+     },
+     "metric": {
+         "file": "metrics.csv",
+         "search_cols": ["Metric Name", "Abbreviation", "Industry", "Interpretation"],
+         "output_cols": ["Metric Name", "Abbreviation", "Industry", "Formula", "Interpretation", "Good Benchmark", "Related Metrics", "Visualization"]
+     },
+     "chart": {
+         "file": "charts.csv",
+         "search_cols": ["Chart Type", "Best For", "Data Type", "Comparison Type"],
+         "output_cols": ["Chart Type", "Best For", "Data Type", "Comparison Type", "Python Code", "Color Guidance", "Accessibility", "Dashboard Tip"]
+     },
+     "cleaning": {
+         "file": "cleaning.csv",
+         "search_cols": ["Issue Type", "Detection Method", "Solution"],
+         "output_cols": ["Issue Type", "Detection Method", "Solution", "Python Code", "SQL Code", "Impact"]
+     },
+     "sql": {
+         "file": "sql-patterns.csv",
+         "search_cols": ["Pattern Name", "Use Case", "SQL Code"],
+         "output_cols": ["Pattern Name", "Use Case", "SQL Code", "PostgreSQL", "BigQuery", "Performance"]
+     },
+     "python": {
+         "file": "python-patterns.csv",
+         "search_cols": ["Pattern Name", "Use Case", "pandas Code"],
+         "output_cols": ["Pattern Name", "Use Case", "pandas Code", "polars Code", "Performance"]
+     },
+     "database": {
+         "file": "databases.csv",
+         "search_cols": ["Database", "Category", "Guideline", "Do", "Don't"],
+         "output_cols": ["Database", "Category", "Guideline", "Do", "Don't", "Code Example"]
+     },
+     "report": {
+         "file": "report-ux.csv",
+         "search_cols": ["Category", "Guideline", "Do", "Don't"],
+         "output_cols": ["Category", "Guideline", "Do", "Don't", "Example"]
+     },
+     "validation": {
+         "file": "validation.csv",
+         "search_cols": ["Mistake Type", "Description", "Symptoms"],
+         "output_cols": ["Mistake Type", "Description", "Symptoms", "Prevention Query", "User Question"]
+     }
+ }
+
+ INDUSTRY_CONFIG = {
+     "saas": {"file": "industries/saas.csv"},
+     "ecommerce": {"file": "industries/ecommerce.csv"},
+     "finance": {"file": "industries/finance.csv"},
+     "marketing": {"file": "industries/marketing.csv"}
+ }
+
+ # Common columns for all industry files
+ _INDUSTRY_COLS = {
+     "search_cols": ["Metric Name", "Abbreviation", "Category", "Interpretation"],
+     "output_cols": ["Metric Name", "Abbreviation", "Category", "Formula", "Interpretation", "Good Benchmark", "Related Metrics", "Visualization"]
+ }
+
+ AVAILABLE_INDUSTRIES = list(INDUSTRY_CONFIG.keys())
+
+
+ # ============ BM25 IMPLEMENTATION ============
+ class BM25:
+     """BM25 ranking algorithm for text search"""
+
+     def __init__(self, k1=1.5, b=0.75):
+         self.k1 = k1
+         self.b = b
+         self.corpus = []
+         self.doc_lengths = []
+         self.avgdl = 0
+         self.idf = {}
+         self.doc_freqs = defaultdict(int)
+         self.N = 0
+
+     def tokenize(self, text):
+         """Lowercase, split, remove punctuation, filter short words"""
+         text = re.sub(r'[^\w\s]', ' ', str(text).lower())
+         return [w for w in text.split() if len(w) > 2]
+
+     def fit(self, documents):
+         """Build BM25 index from documents"""
+         self.corpus = [self.tokenize(doc) for doc in documents]
+         self.N = len(self.corpus)
+         if self.N == 0:
+             return
+         self.doc_lengths = [len(doc) for doc in self.corpus]
+         self.avgdl = sum(self.doc_lengths) / self.N
+
+         for doc in self.corpus:
+             seen = set()
+             for word in doc:
+                 if word not in seen:
+                     self.doc_freqs[word] += 1
+                     seen.add(word)
+
+         for word, freq in self.doc_freqs.items():
+             self.idf[word] = log((self.N - freq + 0.5) / (freq + 0.5) + 1)
+
+     def score(self, query):
+         """Score all documents against query"""
+         query_tokens = self.tokenize(query)
+         scores = []
+
+         for idx, doc in enumerate(self.corpus):
+             score = 0
+             doc_len = self.doc_lengths[idx]
+             term_freqs = defaultdict(int)
+             for word in doc:
+                 term_freqs[word] += 1
+
+             for token in query_tokens:
+                 if token in self.idf:
+                     tf = term_freqs[token]
+                     idf = self.idf[token]
+                     numerator = tf * (self.k1 + 1)
+                     denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
+                     score += idf * numerator / denominator
+
+             scores.append((idx, score))
+
+         return sorted(scores, key=lambda x: x[1], reverse=True)
+
+
+ # ============ SEARCH FUNCTIONS ============
+ def _load_csv(filepath):
+     """Load CSV and return list of dicts"""
+     with open(filepath, 'r', encoding='utf-8') as f:
+         return list(csv.DictReader(f))
+
+
+ def _search_csv(filepath, search_cols, output_cols, query, max_results):
+     """Core search function using BM25"""
+     if not filepath.exists():
+         return []
+
+     data = _load_csv(filepath)
+
+     # Build documents from search columns
+     documents = [" ".join(str(row.get(col, "")) for col in search_cols) for row in data]
+
+     # BM25 search
+     bm25 = BM25()
+     bm25.fit(documents)
+     ranked = bm25.score(query)
+
+     # Get top results with score > 0
+     results = []
+     for idx, score in ranked[:max_results]:
+         if score > 0:
+             row = data[idx]
+             results.append({col: row.get(col, "") for col in output_cols if col in row})
+
+     return results
+
+
+ def detect_domain(query):
+     """Auto-detect the most relevant domain from query"""
+     query_lower = query.lower()
+
+     domain_keywords = {
+         "workflow": ["workflow", "process", "step", "eda", "dashboard", "cohort", "funnel", "analysis", "pipeline"],
+         "metric": ["metric", "kpi", "mrr", "arr", "churn", "cac", "ltv", "conversion", "rate", "ratio"],
+         "chart": ["chart", "graph", "visualization", "plot", "bar", "line", "pie", "heatmap", "scatter"],
+         "cleaning": ["clean", "missing", "null", "duplicate", "outlier", "impute", "data quality"],
+         "sql": ["sql", "query", "join", "window", "aggregate", "cte", "subquery", "partition"],
+         "python": ["python", "pandas", "polars", "dataframe", "pivot", "groupby", "merge"],
+         "database": ["postgres", "bigquery", "snowflake", "mysql", "database", "connection", "warehouse"],
+         "report": ["dashboard", "report", "layout", "ux", "design", "color", "visual"],
+         "validation": ["mistake", "error", "sanity", "check", "validate", "verify", "wrong"]
+     }
+
+     scores = {domain: sum(1 for kw in keywords if kw in query_lower) for domain, keywords in domain_keywords.items()}
+     best = max(scores, key=scores.get)
+     return best if scores[best] > 0 else "workflow"
+
+
+ def search(query, domain=None, max_results=MAX_RESULTS):
+     """Main search function with auto-domain detection"""
+     if domain is None:
+         domain = detect_domain(query)
+
+     config = CSV_CONFIG.get(domain, CSV_CONFIG["workflow"])
+     filepath = DATA_DIR / config["file"]
+
+     if not filepath.exists():
+         return {"error": f"File not found: {filepath}", "domain": domain}
+
+     results = _search_csv(filepath, config["search_cols"], config["output_cols"], query, max_results)
+
+     return {
+         "domain": domain,
+         "query": query,
+         "file": config["file"],
+         "count": len(results),
+         "results": results
+     }
+
+
+ def search_industry(query, industry, max_results=MAX_RESULTS):
+     """Search industry-specific metrics"""
+     if industry not in INDUSTRY_CONFIG:
+         return {"error": f"Unknown industry: {industry}. Available: {', '.join(AVAILABLE_INDUSTRIES)}"}
+
+     filepath = DATA_DIR / INDUSTRY_CONFIG[industry]["file"]
+
+     if not filepath.exists():
+         return {"error": f"Industry file not found: {filepath}", "industry": industry}
+
+     results = _search_csv(filepath, _INDUSTRY_COLS["search_cols"], _INDUSTRY_COLS["output_cols"], query, max_results)
+
+     return {
+         "domain": "industry",
+         "industry": industry,
+         "query": query,
+         "file": INDUSTRY_CONFIG[industry]["file"],
+         "count": len(results),
+         "results": results
+     }
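
The new module exposes detect_domain(), search(), and search_industry() on top of the BM25 index. A minimal usage sketch, not part of the package itself, assuming it is run from the scripts/ directory so core is importable and the CSV files are installed in the sibling data/ directory that DATA_DIR points to:

    # Hypothetical driver code: query the bundled CSVs directly through core.py.
    from core import detect_domain, search, search_industry

    print(detect_domain("how should I handle null values"))       # -> "cleaning"

    result = search("cohort retention analysis")                   # domain auto-detected as "workflow"
    for row in result["results"]:
        print(row.get("Workflow Name"), "-", row.get("Step Name"))

    saas = search_industry("churn rate", "saas", max_results=2)    # industry-specific metrics
    print(saas["count"], "result(s) from", saas["file"])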
@@ -0,0 +1,61 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ CrushData AI Search - CLI entry point for data analyst search
+ Usage: python search.py "<query>" [--domain <domain>] [--industry <industry>] [--max-results 3]
+
+ Domains: workflow, metric, chart, cleaning, sql, python, database, report, validation
+ Industries: saas, ecommerce, finance, marketing
+ """
+
+ import argparse
+ from core import CSV_CONFIG, AVAILABLE_INDUSTRIES, MAX_RESULTS, search, search_industry
+
+
+ def format_output(result):
+     """Format results for AI consumption (token-optimized)"""
+     if "error" in result:
+         return f"Error: {result['error']}"
+
+     output = []
+     if result.get("industry"):
+         output.append(f"## CrushData AI Industry Metrics")
+         output.append(f"**Industry:** {result['industry']} | **Query:** {result['query']}")
+     else:
+         output.append(f"## CrushData AI Search Results")
+         output.append(f"**Domain:** {result['domain']} | **Query:** {result['query']}")
+     output.append(f"**Source:** {result['file']} | **Found:** {result['count']} results\n")
+
+     for i, row in enumerate(result['results'], 1):
+         output.append(f"### Result {i}")
+         for key, value in row.items():
+             value_str = str(value)
+             if len(value_str) > 300:
+                 value_str = value_str[:300] + "..."
+             output.append(f"- **{key}:** {value_str}")
+         output.append("")
+
+     return "\n".join(output)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="CrushData AI Search")
+     parser.add_argument("query", help="Search query")
+     parser.add_argument("--domain", "-d", choices=list(CSV_CONFIG.keys()), help="Search domain")
+     parser.add_argument("--industry", "-i", choices=AVAILABLE_INDUSTRIES, help="Industry-specific search (saas, ecommerce, finance, marketing)")
+     parser.add_argument("--max-results", "-n", type=int, default=MAX_RESULTS, help="Max results (default: 3)")
+     parser.add_argument("--json", action="store_true", help="Output as JSON")
+
+     args = parser.parse_args()
+
+     # Industry search takes priority
+     if args.industry:
+         result = search_industry(args.query, args.industry, args.max_results)
+     else:
+         result = search(args.query, args.domain, args.max_results)
+
+     if args.json:
+         import json
+         print(json.dumps(result, indent=2, ensure_ascii=False))
+     else:
+         print(format_output(result))
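
search.py is a thin argparse wrapper around core; its own docstring documents the CLI shape (python search.py "<query>" [--domain ...] [--industry ...]). The same path can also be exercised in-process. A minimal sketch, assuming both files sit together in scripts/ and the data/ CSVs exist:

    # Hypothetical in-process equivalent of: python search.py "running total by month" --domain sql -n 2
    from core import search
    from search import format_output        # safe to import: the CLI code sits under the __main__ guard

    result = search("running total by month", domain="sql", max_results=2)
    print(format_output(result))             # markdown-style summary; values longer than 300 chars are truncated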
@@ -0,0 +1,36 @@
+ Pattern Name,Use Case,SQL Code,PostgreSQL,BigQuery,Performance
+ Running Total,"Cumulative sum over time","SUM(value) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING)","Same","Same","Efficient with index on date column"
+ Running Average,"Moving average over all prior rows","AVG(value) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING)","Same","Same","Consider fixed window for performance"
+ Rolling Window Average,"N-period moving average","AVG(value) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)","Same","Same","Fixed window more efficient than unbounded"
+ Lag Previous Value,"Compare to previous row","LAG(value, 1) OVER (ORDER BY date)","Same","Same","Useful for period-over-period calculations"
+ Lead Next Value,"Look ahead to next row","LEAD(value, 1) OVER (ORDER BY date)","Same","Same","Use for forward-looking comparisons"
+ Year over Year,"Compare to same period last year","LAG(value, 12) OVER (ORDER BY month) for monthly; or self-join on date - INTERVAL '1 year'","Same; use date_trunc('year', date)","DATE_SUB(date, INTERVAL 1 YEAR)","Index on date; pre-aggregate to month level"
+ Month over Month,"Compare to previous month","LAG(value, 1) OVER (ORDER BY month)","Same","Same","Pre-aggregate daily to monthly first"
+ Percent Change,"Calculate growth rate","(value - LAG(value, 1) OVER (ORDER BY date)) / NULLIF(LAG(value, 1) OVER (ORDER BY date), 0) * 100","Same","Same","Handle divide by zero with NULLIF"
+ Rank,"Rank rows by value","RANK() OVER (ORDER BY value DESC)","Same","Same","Gaps in ranking for ties"
+ Dense Rank,"Rank without gaps","DENSE_RANK() OVER (ORDER BY value DESC)","Same","Same","No gaps - consecutive numbers"
+ Row Number,"Unique row identifier","ROW_NUMBER() OVER (ORDER BY date)","Same","Same","Good for pagination"
+ Percent Rank,"Percentile position","PERCENT_RANK() OVER (ORDER BY value)","Same","Same","Returns 0-1 scale"
+ NTILE Buckets,"Divide into N equal groups","NTILE(4) OVER (ORDER BY value)","Same","Same","Useful for quartile analysis"
+ First Value in Group,"Get first value per partition","FIRST_VALUE(value) OVER (PARTITION BY group ORDER BY date)","Same","Same","Useful for cohort first action"
+ Last Value in Group,"Get last value per partition","LAST_VALUE(value) OVER (PARTITION BY group ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)","Same","Same","Must specify frame for last value"
+ Deduplication,"Get latest record per entity","WITH ranked AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY id ORDER BY updated_at DESC) as rn FROM table) SELECT * FROM ranked WHERE rn = 1","Same","Use QUALIFY instead: SELECT * FROM table QUALIFY ROW_NUMBER() OVER (PARTITION BY id ORDER BY updated_at DESC) = 1","Index on partition and order columns"
+ Gap Fill Dates,"Fill missing dates in time series","Use generate_series to create date spine then LEFT JOIN","generate_series(start_date, end_date, '1 day'::interval)","GENERATE_DATE_ARRAY(start_date, end_date)","Generate date spine first, then join data"
+ Cohort Definition,"Assign users to signup cohort","SELECT user_id, DATE_TRUNC('month', MIN(signup_date)) OVER (PARTITION BY user_id) as cohort FROM events","Same","DATE_TRUNC(signup_date, MONTH)","Calculate cohort once and store"
+ Retention Cohort,"Calculate retention by cohort","WITH cohorts AS (...), activity AS (...) SELECT cohort, DATEDIFF(activity_month, cohort) as period, COUNT(DISTINCT user_id)","Same; use date_part('month', age(...))","DATE_DIFF(activity_date, cohort_date, MONTH)","Pre-compute user cohorts for efficiency"
+ Funnel Sequential,"Ensure funnel steps happen in order","WITH step1 AS (...), step2 AS (... WHERE step2_time > step1_time) SELECT ...","Same","Same","Index on user_id and timestamp"
+ Funnel Conversion,"Count users at each funnel step","SELECT 'Step1' as step, COUNT(DISTINCT user_id) UNION ALL SELECT 'Step2', COUNT(DISTINCT CASE WHEN completed_step2 THEN user_id END)","Same","Same","One pass aggregation is efficient"
+ Sessionization,"Group events into sessions by gap","SUM(CASE WHEN time_since_last > 30 THEN 1 ELSE 0 END) OVER (PARTITION BY user ORDER BY timestamp) as session_id","Same","Same","30 minute gap is common default"
+ Pivot Dynamic,"Pivot rows to columns dynamically","Use CASE WHEN for known values or crosstab() extension","crosstab() function from tablefunc","PIVOT operator available","Static CASE WHEN is more portable"
+ Unpivot,"Convert columns to rows","Use UNION ALL for each column or UNPIVOT keyword","UNNEST with ARRAY","UNPIVOT operator","UNION ALL works everywhere"
+ Self Join for Pairs,"Find related records","SELECT a.*, b.* FROM table a JOIN table b ON a.user_id = b.user_id AND a.id < b.id","Same","Same","Use a.id < b.id to avoid duplicates"
+ Recursive CTE,"Hierarchical data traversal","WITH RECURSIVE cte AS (base UNION ALL recursive) SELECT * FROM cte","Same","Does not support - use CONNECT BY or flatten","Limit recursion depth for safety"
+ Anti Join,"Find records NOT in another table","SELECT * FROM a WHERE NOT EXISTS (SELECT 1 FROM b WHERE a.id = b.id)","Same; also LEFT JOIN WHERE b.id IS NULL","Same","NOT EXISTS often most efficient"
+ Conditional Aggregation,"Aggregate with conditions","SUM(CASE WHEN status = 'active' THEN amount ELSE 0 END)","Same; also FILTER clause: SUM(amount) FILTER (WHERE status = 'active')","COUNTIF, SUMIF available","CASE WHEN is most portable"
+ Distinct Count Per Group,"Count distinct within groups","COUNT(DISTINCT user_id) OVER (PARTITION BY category)","Same","Same; also APPROX_COUNT_DISTINCT for estimates","Expensive - consider HyperLogLog"
+ Median Calculation,"Find median value","PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY value)","Same","APPROX_QUANTILES(value, 100)[OFFSET(50)]","Exact median is expensive; approximate is faster"
+ Mode Calculation,"Find most frequent value","SELECT value, COUNT(*) as cnt FROM table GROUP BY value ORDER BY cnt DESC LIMIT 1","Also: mode() WITHIN GROUP (ORDER BY value)","APPROX_TOP_COUNT for approximate","Order by count descending, limit 1"
+ Time Bucket,"Group timestamps into buckets","DATE_TRUNC('hour', timestamp)","date_trunc('hour', ts)","TIMESTAMP_TRUNC(ts, HOUR)","Reduces granularity for aggregation"
+ Date Spine Join,"Ensure all dates present","SELECT d.date, COALESCE(t.value, 0) FROM date_spine d LEFT JOIN table t ON d.date = t.date","generate_series for date spine","GENERATE_DATE_ARRAY","Create date dimension table"
+ Weighted Average,"Calculate weighted average","SUM(value * weight) / NULLIF(SUM(weight), 0)","Same","Same","Handle zero weight with NULLIF"
+ Compound Growth Rate,"Calculate CAGR","POWER(end_value / start_value, 1.0 / years) - 1","Same; use POWER() function","POWER() function","Need start, end, and period count"
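
These rows are what the sql domain of core.search() serves back. A small sketch under the same path assumptions as above; the specific query string and the expected top hit are illustrative, not guaranteed:

    # Hypothetical lookup against the sql-patterns.csv rows shipped above.
    from core import search

    hit = search("latest record per entity", domain="sql", max_results=1)
    for row in hit["results"]:
        print(row["Pattern Name"])    # expected to be the "Deduplication" row
        print(row["SQL Code"])        # ROW_NUMBER() OVER (PARTITION BY id ORDER BY updated_at DESC) ...
        print(row["BigQuery"])        # the QUALIFY variant listed in the CSV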
@@ -0,0 +1,21 @@
+ Mistake Type,Description,Symptoms,Prevention Query,User Question
+ Duplicate Inflation,Counting same record multiple times due to duplicates or join multiplication,"Total much higher than expected; sum doesn't match source","SELECT id, COUNT(*) as cnt FROM table GROUP BY id HAVING COUNT(*) > 1","Does the total of X seem reasonable compared to other reports?"
+ Wrong Granularity,Aggregating at wrong level (user vs session vs event),"Numbers don't match other reports; unexpected row counts","SELECT COUNT(*), COUNT(DISTINCT user_id), COUNT(DISTINCT session_id) FROM table","Is this data one row per user, per session, or per event?"
+ Missing Filter,Forgot to exclude test users, internal accounts, or invalid data,"Numbers include test data; higher than expected","SELECT COUNT(*) FROM users WHERE email LIKE '%@company.com' OR email LIKE '%test%'","Should we exclude internal/test users? Any known filters?"
+ Timezone Mismatch,Comparing dates in different timezones causing misalignment,"Day totals don't match other reports; off-by-one errors","SELECT DISTINCT date_trunc('day', ts AT TIME ZONE 'UTC') vs AT TIME ZONE 'PST'","What timezone should I use for date calculations?"
+ Survivorship Bias,Only analyzing users who completed journey ignoring dropoffs,"Metrics look too good; missing failed attempts","Check: are we only looking at users who converted?","Are we analyzing all users or only those who [completed action]?"
+ Simpson's Paradox,Aggregate trend opposite of subgroup trends,"Conflicting conclusions; unexpected direction","Compare aggregate vs segment-level trends","Should we break this down by [segment] to check for hidden patterns?"
+ Incomplete Time Period,Comparing full period to partial period,"Latest period looks lower than historical","Check if latest period has full data: WHERE date < current_date","Is the latest time period complete, or should we exclude it?"
+ Wrong Join Type,Using INNER when LEFT needed or vice versa,"Missing rows; unexpected nulls; row count changes","Compare row counts before and after join","The join produced X rows from Y original rows. Does this match expectation?"
+ Null Handling Errors,NULLs excluded from aggregations unexpectedly,"Lower counts than expected; divisions by zero","SELECT COUNT(*), COUNT(column), SUM(CASE WHEN column IS NULL THEN 1 END)","How should we handle missing/null values in this analysis?"
+ Off-by-One Date Errors,BETWEEN includes endpoints; wrong date boundary,"One extra or missing day; period mismatch","Check: date >= start AND date < end (exclusive end)","Should the date range include or exclude the end date?"
+ Metric Definition Mismatch,Using different definition than stakeholder expects,"Numbers don't match expectations; confusion","Document exact definition before starting","How does your team define [metric]? What's included/excluded?"
+ Currency Unit Confusion,Mixing dollars and cents or different currencies,"Numbers off by factor of 100 or exchange rate","Check: are amounts in dollars or cents? One currency?","Are these amounts in dollars or cents? Same currency throughout?"
+ Seasonality Ignored,Comparing periods with different seasonal patterns,"Invalid conclusions; unfair comparisons","Compare same period last year, not sequential periods","Should we compare to same period last year to account for seasonality?"
+ Selection Bias,Analyzing non-representative sample,"Conclusions don't generalize; biased insights","Check how sample was selected; compare to population","Is this sample representative of all users, or a specific subset?"
+ Correlation vs Causation,Claiming causation from correlation,"Incorrect business recommendations","Check: is there a plausible mechanism? Control for confounders?","Does X actually cause Y, or are they just correlated?"
+ Cherry Picking Dates,Choosing date range that shows desired narrative,"Misleading conclusions; not reproducible","Use standard reporting periods; document why dates chosen","Why was this specific date range chosen?"
+ Aggregation Level Mismatch,Comparing metrics calculated at different levels,"Apples to oranges comparison; invalid conclusions","Ensure both metrics use same denominator/level","Are both these metrics calculated the same way (same level)?"
+ Data Latency Issues,Using stale data that hasn't propagated fully,"Recent periods look incomplete; inconsistent","Check data freshness: MAX(updated_at), pipeline completion","Is this data fully loaded? When was it last updated?"
+ Calculation Errors,Wrong formula for complex metrics,"Metrics don't match known correct values","Validate against known correct calculation or source","Can we validate this against another source or manual calculation?"
+ Presentation Bias,Chart design exaggerating or hiding patterns,"Misleading visualizations; wrong conclusions","Check: y-axis starts at zero? Scale appropriate?","Does this chart accurately represent the data without distortion?"
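
The Prevention Query column is written as SQL; the same checks translate to pandas when the data is already in a DataFrame. A hedged sketch of the first check (Duplicate Inflation), using a made-up frame purely for illustration:

    # Hypothetical pandas version of the "Duplicate Inflation" prevention query above.
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, 2, 3], "amount": [10.0, 25.0, 25.0, 40.0]})

    dupes = df.groupby("id").size().reset_index(name="cnt").query("cnt > 1")
    print(dupes)                                   # ids appearing more than once
    print(df["amount"].sum())                      # 100.0 - inflated by the duplicate row
    print(df.drop_duplicates()["amount"].sum())    # 75.0 - after removing exact duplicates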
@@ -1,3 +1,8 @@
+ ---
+ name: data-analyst
+ description: A data analyst workflow for structured, professional data analysis. Use when user requests data analysis, dashboards, metrics, EDA, cohort, funnel, or A/B tests.
+ ---
+
  # CrushData AI - Data Analyst Workflow

  A data analyst intelligence workflow for structured, professional data analysis.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "crushdataai",
-   "version": "1.2.8",
+   "version": "1.2.10",
    "description": "CLI to install CrushData AI data analyst skill for AI coding assistants",
    "main": "dist/index.js",
    "bin": {