@wentorai/research-plugins 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/README.md +22 -22
  2. package/curated/analysis/README.md +71 -56
  3. package/curated/domains/README.md +176 -67
  4. package/curated/literature/README.md +71 -47
  5. package/curated/research/README.md +91 -58
  6. package/curated/tools/README.md +88 -87
  7. package/curated/writing/README.md +80 -45
  8. package/mcp-configs/cloud-docs/confluence-mcp.json +37 -0
  9. package/mcp-configs/cloud-docs/google-drive-mcp.json +35 -0
  10. package/mcp-configs/cloud-docs/notion-mcp.json +29 -0
  11. package/mcp-configs/communication/discord-mcp.json +29 -0
  12. package/mcp-configs/communication/slack-mcp.json +29 -0
  13. package/mcp-configs/communication/telegram-mcp.json +28 -0
  14. package/mcp-configs/database/neo4j-mcp.json +37 -0
  15. package/mcp-configs/database/postgres-mcp.json +28 -0
  16. package/mcp-configs/database/sqlite-mcp.json +29 -0
  17. package/mcp-configs/dev-platform/github-mcp.json +31 -0
  18. package/mcp-configs/dev-platform/gitlab-mcp.json +34 -0
  19. package/mcp-configs/email/email-mcp.json +40 -0
  20. package/mcp-configs/email/gmail-mcp.json +37 -0
  21. package/mcp-configs/registry.json +178 -149
  22. package/mcp-configs/repository/dataverse-mcp.json +33 -0
  23. package/mcp-configs/repository/huggingface-mcp.json +29 -0
  24. package/openclaw.plugin.json +2 -2
  25. package/package.json +2 -2
  26. package/skills/analysis/dataviz/algorithm-visualizer-guide/SKILL.md +259 -0
  27. package/skills/analysis/dataviz/bokeh-visualization-guide/SKILL.md +270 -0
  28. package/skills/analysis/dataviz/chart-image-generator/SKILL.md +229 -0
  29. package/skills/analysis/dataviz/d3-visualization-guide/SKILL.md +281 -0
  30. package/skills/analysis/dataviz/echarts-visualization-guide/SKILL.md +250 -0
  31. package/skills/analysis/dataviz/metabase-analytics-guide/SKILL.md +242 -0
  32. package/skills/analysis/dataviz/plotly-interactive-guide/SKILL.md +266 -0
  33. package/skills/analysis/dataviz/redash-analytics-guide/SKILL.md +284 -0
  34. package/skills/analysis/econometrics/econml-causal-guide/SKILL.md +163 -0
  35. package/skills/analysis/econometrics/mostly-harmless-guide/SKILL.md +139 -0
  36. package/skills/analysis/econometrics/panel-data-analyst/SKILL.md +259 -0
  37. package/skills/analysis/econometrics/python-causality-guide/SKILL.md +134 -0
  38. package/skills/analysis/econometrics/stata-accounting-guide/SKILL.md +269 -0
  39. package/skills/analysis/econometrics/stata-analyst-guide/SKILL.md +245 -0
  40. package/skills/analysis/statistics/data-anomaly-detection/SKILL.md +157 -0
  41. package/skills/analysis/statistics/ml-experiment-tracker/SKILL.md +212 -0
  42. package/skills/analysis/statistics/pywayne-statistics-guide/SKILL.md +192 -0
  43. package/skills/analysis/statistics/quantitative-methods-guide/SKILL.md +193 -0
  44. package/skills/analysis/statistics/senior-data-scientist-guide/SKILL.md +223 -0
  45. package/skills/analysis/wrangling/csv-data-analyzer/SKILL.md +170 -0
  46. package/skills/analysis/wrangling/data-cleaning-pipeline/SKILL.md +266 -0
  47. package/skills/analysis/wrangling/data-cog-guide/SKILL.md +178 -0
  48. package/skills/analysis/wrangling/stata-data-cleaning/SKILL.md +276 -0
  49. package/skills/analysis/wrangling/survey-data-processing/SKILL.md +298 -0
  50. package/skills/domains/ai-ml/ai-model-benchmarking/SKILL.md +209 -0
  51. package/skills/domains/ai-ml/annotated-dl-papers-guide/SKILL.md +159 -0
  52. package/skills/domains/ai-ml/dl-transformer-finetune/SKILL.md +239 -0
  53. package/skills/domains/ai-ml/generative-ai-guide/SKILL.md +146 -0
  54. package/skills/domains/ai-ml/huggingface-inference-guide/SKILL.md +196 -0
  55. package/skills/domains/ai-ml/keras-deep-learning/SKILL.md +210 -0
  56. package/skills/domains/ai-ml/llm-from-scratch-guide/SKILL.md +124 -0
  57. package/skills/domains/ai-ml/ml-pipeline-guide/SKILL.md +295 -0
  58. package/skills/domains/ai-ml/nlp-toolkit-guide/SKILL.md +247 -0
  59. package/skills/domains/ai-ml/pytorch-guide/SKILL.md +281 -0
  60. package/skills/domains/ai-ml/pytorch-lightning-guide/SKILL.md +244 -0
  61. package/skills/domains/ai-ml/tensorflow-guide/SKILL.md +241 -0
  62. package/skills/domains/biomedical/bioagents-guide/SKILL.md +308 -0
  63. package/skills/domains/biomedical/medgeclaw-guide/SKILL.md +345 -0
  64. package/skills/domains/biomedical/medical-imaging-guide/SKILL.md +305 -0
  65. package/skills/domains/business/architecture-design-guide/SKILL.md +279 -0
  66. package/skills/domains/business/innovation-management-guide/SKILL.md +257 -0
  67. package/skills/domains/business/operations-research-guide/SKILL.md +258 -0
  68. package/skills/domains/chemistry/molecular-dynamics-guide/SKILL.md +237 -0
  69. package/skills/domains/chemistry/pubchem-api-guide/SKILL.md +180 -0
  70. package/skills/domains/chemistry/spectroscopy-analysis-guide/SKILL.md +290 -0
  71. package/skills/domains/cs/distributed-systems-guide/SKILL.md +268 -0
  72. package/skills/domains/cs/formal-verification-guide/SKILL.md +298 -0
  73. package/skills/domains/ecology/species-distribution-guide/SKILL.md +343 -0
  74. package/skills/domains/economics/imf-data-api-guide/SKILL.md +174 -0
  75. package/skills/domains/economics/post-labor-economics/SKILL.md +254 -0
  76. package/skills/domains/economics/pricing-psychology-guide/SKILL.md +273 -0
  77. package/skills/domains/economics/world-bank-data-guide/SKILL.md +179 -0
  78. package/skills/domains/education/assessment-design-guide/SKILL.md +213 -0
  79. package/skills/domains/education/educational-research-methods/SKILL.md +179 -0
  80. package/skills/domains/education/mooc-analytics-guide/SKILL.md +206 -0
  81. package/skills/domains/finance/portfolio-optimization-guide/SKILL.md +279 -0
  82. package/skills/domains/finance/risk-modeling-guide/SKILL.md +260 -0
  83. package/skills/domains/finance/stata-accounting-research/SKILL.md +372 -0
  84. package/skills/domains/geoscience/climate-modeling-guide/SKILL.md +215 -0
  85. package/skills/domains/geoscience/satellite-remote-sensing/SKILL.md +193 -0
  86. package/skills/domains/geoscience/seismology-data-guide/SKILL.md +208 -0
  87. package/skills/domains/humanities/ethical-philosophy-guide/SKILL.md +244 -0
  88. package/skills/domains/humanities/history-research-guide/SKILL.md +260 -0
  89. package/skills/domains/humanities/political-history-guide/SKILL.md +241 -0
  90. package/skills/domains/law/legal-nlp-guide/SKILL.md +236 -0
  91. package/skills/domains/law/patent-analysis-guide/SKILL.md +257 -0
  92. package/skills/domains/law/regulatory-compliance-guide/SKILL.md +267 -0
  93. package/skills/domains/math/symbolic-computation-guide/SKILL.md +263 -0
  94. package/skills/domains/math/topology-data-analysis/SKILL.md +305 -0
  95. package/skills/domains/pharma/clinical-trial-design-guide/SKILL.md +271 -0
  96. package/skills/domains/pharma/drug-target-interaction/SKILL.md +242 -0
  97. package/skills/domains/pharma/pharmacovigilance-guide/SKILL.md +216 -0
  98. package/skills/domains/physics/astrophysics-data-guide/SKILL.md +305 -0
  99. package/skills/domains/physics/particle-physics-guide/SKILL.md +287 -0
  100. package/skills/domains/social-science/network-analysis-guide/SKILL.md +310 -0
  101. package/skills/domains/social-science/psychology-research-guide/SKILL.md +270 -0
  102. package/skills/domains/social-science/sociology-research-guide/SKILL.md +238 -0
  103. package/skills/literature/discovery/paper-recommendation-guide/SKILL.md +120 -0
  104. package/skills/literature/discovery/semantic-paper-radar/SKILL.md +144 -0
  105. package/skills/literature/discovery/zotero-arxiv-daily-guide/SKILL.md +94 -0
  106. package/skills/literature/fulltext/core-api-guide/SKILL.md +144 -0
  107. package/skills/literature/fulltext/institutional-repository-guide/SKILL.md +212 -0
  108. package/skills/literature/fulltext/open-access-mining-guide/SKILL.md +341 -0
  109. package/skills/literature/metadata/academic-paper-summarizer/SKILL.md +101 -0
  110. package/skills/literature/metadata/wikidata-api-guide/SKILL.md +156 -0
  111. package/skills/literature/search/arxiv-batch-reporting/SKILL.md +133 -0
  112. package/skills/literature/search/arxiv-paper-processor/SKILL.md +141 -0
  113. package/skills/literature/search/baidu-scholar-guide/SKILL.md +110 -0
  114. package/skills/literature/search/chatpaper-guide/SKILL.md +122 -0
  115. package/skills/literature/search/deep-literature-search/SKILL.md +149 -0
  116. package/skills/literature/search/deepgit-search-guide/SKILL.md +147 -0
  117. package/skills/literature/search/pasa-paper-search-guide/SKILL.md +138 -0
  118. package/skills/research/automation/ai-scientist-v2-guide/SKILL.md +284 -0
  119. package/skills/research/automation/aim-experiment-guide/SKILL.md +234 -0
  120. package/skills/research/automation/datagen-research-guide/SKILL.md +131 -0
  121. package/skills/research/automation/kedro-pipeline-guide/SKILL.md +216 -0
  122. package/skills/research/automation/mle-agent-guide/SKILL.md +139 -0
  123. package/skills/research/automation/paper-to-agent-guide/SKILL.md +116 -0
  124. package/skills/research/automation/rd-agent-guide/SKILL.md +246 -0
  125. package/skills/research/automation/research-paper-orchestrator/SKILL.md +254 -0
  126. package/skills/research/deep-research/academic-deep-research/SKILL.md +190 -0
  127. package/skills/research/deep-research/auto-deep-research-guide/SKILL.md +141 -0
  128. package/skills/research/deep-research/deep-research-pro/SKILL.md +213 -0
  129. package/skills/research/deep-research/deep-research-work/SKILL.md +204 -0
  130. package/skills/research/deep-research/deep-searcher-guide/SKILL.md +253 -0
  131. package/skills/research/deep-research/gpt-researcher-guide/SKILL.md +191 -0
  132. package/skills/research/deep-research/khoj-research-guide/SKILL.md +200 -0
  133. package/skills/research/deep-research/local-deep-research-guide/SKILL.md +253 -0
  134. package/skills/research/deep-research/tongyi-deep-research-guide/SKILL.md +217 -0
  135. package/skills/research/funding/eu-horizon-guide/SKILL.md +244 -0
  136. package/skills/research/funding/grant-budget-guide/SKILL.md +284 -0
  137. package/skills/research/funding/nih-reporter-api-guide/SKILL.md +166 -0
  138. package/skills/research/funding/nsf-award-api-guide/SKILL.md +133 -0
  139. package/skills/research/methodology/academic-mentor-guide/SKILL.md +169 -0
  140. package/skills/research/methodology/claude-scientific-guide/SKILL.md +122 -0
  141. package/skills/research/methodology/deep-innovator-guide/SKILL.md +242 -0
  142. package/skills/research/methodology/osf-api-guide/SKILL.md +165 -0
  143. package/skills/research/methodology/research-paper-kb/SKILL.md +263 -0
  144. package/skills/research/methodology/research-town-guide/SKILL.md +263 -0
  145. package/skills/research/paper-review/automated-review-guide/SKILL.md +281 -0
  146. package/skills/research/paper-review/paper-compare-guide/SKILL.md +238 -0
  147. package/skills/research/paper-review/paper-digest-guide/SKILL.md +240 -0
  148. package/skills/research/paper-review/paper-research-assistant/SKILL.md +231 -0
  149. package/skills/research/paper-review/research-quality-filter/SKILL.md +261 -0
  150. package/skills/research/paper-review/review-response-guide/SKILL.md +275 -0
  151. package/skills/tools/code-exec/google-colab-guide/SKILL.md +276 -0
  152. package/skills/tools/code-exec/kaggle-api-guide/SKILL.md +216 -0
  153. package/skills/tools/code-exec/overleaf-cli-guide/SKILL.md +279 -0
  154. package/skills/tools/diagram/code-flow-visualizer/SKILL.md +197 -0
  155. package/skills/tools/diagram/excalidraw-diagram-guide/SKILL.md +170 -0
  156. package/skills/tools/diagram/json-data-visualizer/SKILL.md +270 -0
  157. package/skills/tools/diagram/mermaid-architect-guide/SKILL.md +219 -0
  158. package/skills/tools/diagram/tldraw-whiteboard-guide/SKILL.md +397 -0
  159. package/skills/tools/document/docsgpt-guide/SKILL.md +130 -0
  160. package/skills/tools/document/large-document-reader/SKILL.md +202 -0
  161. package/skills/tools/document/paper-parse-guide/SKILL.md +243 -0
  162. package/skills/tools/knowledge-graph/citation-network-builder/SKILL.md +244 -0
  163. package/skills/tools/knowledge-graph/concept-map-generator/SKILL.md +284 -0
  164. package/skills/tools/knowledge-graph/graphiti-guide/SKILL.md +219 -0
  165. package/skills/tools/ocr-translate/pdf-math-translate-guide/SKILL.md +141 -0
  166. package/skills/tools/ocr-translate/zotero-pdf-translate-guide/SKILL.md +95 -0
  167. package/skills/tools/ocr-translate/zotero-pdf2zh-guide/SKILL.md +143 -0
  168. package/skills/tools/scraping/dataset-finder-guide/SKILL.md +253 -0
  169. package/skills/tools/scraping/easy-spider-guide/SKILL.md +250 -0
  170. package/skills/tools/scraping/google-scholar-scraper/SKILL.md +255 -0
  171. package/skills/tools/scraping/repository-harvesting-guide/SKILL.md +310 -0
  172. package/skills/writing/citation/academic-citation-manager/SKILL.md +314 -0
  173. package/skills/writing/citation/jabref-reference-guide/SKILL.md +127 -0
  174. package/skills/writing/citation/jasminum-zotero-guide/SKILL.md +103 -0
  175. package/skills/writing/citation/obsidian-citation-guide/SKILL.md +164 -0
  176. package/skills/writing/citation/obsidian-zotero-guide/SKILL.md +137 -0
  177. package/skills/writing/citation/papersgpt-zotero-guide/SKILL.md +132 -0
  178. package/skills/writing/citation/papis-cli-guide/SKILL.md +213 -0
  179. package/skills/writing/citation/zotero-better-bibtex-guide/SKILL.md +107 -0
  180. package/skills/writing/citation/zotero-better-notes-guide/SKILL.md +121 -0
  181. package/skills/writing/citation/zotero-gpt-guide/SKILL.md +111 -0
  182. package/skills/writing/citation/zotero-mcp-guide/SKILL.md +164 -0
  183. package/skills/writing/citation/zotero-mdnotes-guide/SKILL.md +162 -0
  184. package/skills/writing/citation/zotero-reference-guide/SKILL.md +139 -0
  185. package/skills/writing/citation/zotero-scholar-guide/SKILL.md +294 -0
  186. package/skills/writing/citation/zotfile-attachment-guide/SKILL.md +140 -0
  187. package/skills/writing/composition/ml-paper-writing/SKILL.md +163 -0
  188. package/skills/writing/composition/paper-debugger-guide/SKILL.md +143 -0
  189. package/skills/writing/composition/scientific-writing-resources/SKILL.md +151 -0
  190. package/skills/writing/composition/scientific-writing-wrapper/SKILL.md +153 -0
  191. package/skills/writing/latex/latex-drawing-collection/SKILL.md +154 -0
  192. package/skills/writing/latex/latex-templates-collection/SKILL.md +159 -0
  193. package/skills/writing/latex/md-to-pdf-academic/SKILL.md +230 -0
  194. package/skills/writing/latex/tex-render-guide/SKILL.md +243 -0
  195. package/skills/writing/polish/academic-tone-guide/SKILL.md +209 -0
  196. package/skills/writing/polish/conciseness-editing-guide/SKILL.md +225 -0
  197. package/skills/writing/polish/paper-polish-guide/SKILL.md +160 -0
  198. package/skills/writing/templates/graphical-abstract-guide/SKILL.md +183 -0
  199. package/skills/writing/templates/novathesis-guide/SKILL.md +152 -0
  200. package/skills/writing/templates/scientific-article-pdf/SKILL.md +261 -0
  201. package/skills/writing/templates/sjtuthesis-guide/SKILL.md +197 -0
  202. package/skills/writing/templates/thuthesis-guide/SKILL.md +181 -0
  203. package/skills/literature/fulltext/repository-harvesting-guide/SKILL.md +0 -207
@@ -0,0 +1,295 @@
1
+ ---
2
+ name: ml-pipeline-guide
3
+ description: "Build and deploy reproducible production ML pipelines for research"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "🔧"
7
+ category: "domains"
8
+ subcategory: "ai-ml"
9
+ keywords: ["MLOps", "pipeline", "deployment", "reproducibility", "feature engineering", "CI/CD"]
10
+ source: "https://github.com/mlflow/mlflow"
11
+ ---
12
+
13
+ # ML Pipeline Guide
14
+
15
+ ## Overview
16
+
17
+ Machine learning research increasingly demands reproducible, end-to-end pipelines that go beyond a single training script. A research ML pipeline encompasses data ingestion, feature engineering, model training, evaluation, experiment tracking, and artifact management. Without a structured pipeline, research results become difficult to reproduce, ablation studies become error-prone, and collaborators cannot build on prior work.
18
+
19
+ This guide covers the practical tools and patterns for building ML pipelines in an academic research context. The focus is on reproducibility, experiment tracking, and the transition from notebook prototyping to structured experiments. The patterns use MLflow, DVC, and standard Python tooling -- chosen because they are open source, widely adopted in published research, and require minimal infrastructure.
20
+
21
+ Unlike industry MLOps guides that emphasize deployment at scale, this guide prioritizes the research workflow: running many experiments, tracking what changed between runs, and producing results that reviewers can verify.
22
+
23
+ ## Pipeline Architecture
24
+
25
+ A research ML pipeline typically has five stages:
26
+
27
+ ```
28
+ Data Ingestion → Feature Engineering → Training → Evaluation → Artifact Storage
29
+ │ │ │ │ │
30
+ ├── raw data ├── transforms ├── model ├── metrics ├── models
31
+ ├── splits ├── features ├── logs ├── plots ├── configs
32
+ └── metadata └── cache └── ckpts └── tables └── reports
33
+ ```
34
+
35
+ ### Directory Structure for Reproducible Research
36
+
37
+ ```
38
+ project/
39
+ ├── configs/
40
+ │ ├── base.yaml # Default hyperparameters
41
+ │ ├── experiment_001.yaml # Experiment-specific overrides
42
+ │ └── sweep.yaml # Hyperparameter search space
43
+ ├── data/
44
+ │ ├── raw/ # Immutable original data
45
+ │ ├── processed/ # Cleaned and transformed
46
+ │ └── splits/ # Train/val/test splits (versioned)
47
+ ├── src/
48
+ │ ├── data/ # Data loading and preprocessing
49
+ │ ├── features/ # Feature engineering
50
+ │ ├── models/ # Model definitions
51
+ │ ├── training/ # Training loops
52
+ │ └── evaluation/ # Metrics and visualization
53
+ ├── experiments/ # MLflow/W&B experiment logs
54
+ ├── notebooks/ # Exploratory analysis only
55
+ ├── tests/ # Unit tests for pipeline components
56
+ ├── Makefile # Reproducible commands
57
+ ├── requirements.txt # Pinned dependencies
58
+ └── dvc.yaml # Data version control pipeline
59
+ ```
60
+
61
+ ## Experiment Tracking with MLflow
62
+
63
+ ```python
64
+ import mlflow
65
+ import mlflow.pytorch
66
+ from pathlib import Path
67
+
68
+ def run_experiment(config: dict):
69
+ """Run a single experiment with full tracking."""
70
+ mlflow.set_experiment(config["experiment_name"])
71
+
72
+ with mlflow.start_run(run_name=config.get("run_name")):
73
+ # Log configuration
74
+ mlflow.log_params({
75
+ "model": config["model_name"],
76
+ "learning_rate": config["lr"],
77
+ "batch_size": config["batch_size"],
78
+ "epochs": config["epochs"],
79
+ "optimizer": config["optimizer"],
80
+ "seed": config["seed"],
81
+ })
82
+
83
+ # Log environment
84
+ mlflow.log_param("python_version", sys.version)
85
+ mlflow.log_param("torch_version", torch.__version__)
86
+ mlflow.log_param("cuda_version", torch.version.cuda)
87
+
88
+ # Training
89
+ model = build_model(config)
90
+ for epoch in range(config["epochs"]):
91
+ train_loss = train_one_epoch(model, train_loader, optimizer)
92
+ val_loss, val_metrics = evaluate(model, val_loader)
93
+
94
+ mlflow.log_metrics({
95
+ "train_loss": train_loss,
96
+ "val_loss": val_loss,
97
+ **{f"val_{k}": v for k, v in val_metrics.items()},
98
+ }, step=epoch)
99
+
100
+ # Log final model
101
+ mlflow.pytorch.log_model(model, "model")
102
+
103
+ # Log artifacts (plots, configs)
104
+ mlflow.log_artifact(config_path)
105
+ save_evaluation_plots(model, test_loader, "plots/")
106
+ mlflow.log_artifacts("plots/")
107
+
108
+ return val_metrics
109
+ ```
110
+
111
+ ## Data Versioning with DVC
112
+
113
+ ```yaml
114
+ # dvc.yaml -- Pipeline definition
115
+ stages:
116
+ prepare_data:
117
+ cmd: python src/data/prepare.py --config configs/base.yaml
118
+ deps:
119
+ - src/data/prepare.py
120
+ - data/raw/
121
+ outs:
122
+ - data/processed/
123
+ params:
124
+ - configs/base.yaml:
125
+ - data.split_ratio
126
+ - data.random_seed
127
+
128
+ extract_features:
129
+ cmd: python src/features/extract.py --config configs/base.yaml
130
+ deps:
131
+ - src/features/extract.py
132
+ - data/processed/
133
+ outs:
134
+ - data/features/
135
+ params:
136
+ - configs/base.yaml:
137
+ - features
138
+
139
+ train:
140
+ cmd: python src/training/train.py --config configs/base.yaml
141
+ deps:
142
+ - src/training/train.py
143
+ - src/models/
144
+ - data/features/
145
+ outs:
146
+ - models/
147
+ metrics:
148
+ - metrics.json:
149
+ cache: false
150
+ plots:
151
+ - plots/training_curve.csv:
152
+ x: epoch
153
+ y: loss
154
+ ```
155
+
156
+ ```bash
157
+ # Reproduce the full pipeline
158
+ dvc repro
159
+
160
+ # Compare experiments
161
+ dvc metrics diff
162
+
163
+ # Push data to remote storage
164
+ dvc push
165
+ ```
166
+
167
+ ## Configuration Management with Hydra
168
+
169
+ ```python
170
+ import hydra
171
+ from omegaconf import DictConfig, OmegaConf
172
+
173
+ @hydra.main(config_path="configs", config_name="base", version_base=None)
174
+ def main(cfg: DictConfig):
175
+ print(OmegaConf.to_yaml(cfg))
176
+
177
+ model = build_model(
178
+ name=cfg.model.name,
179
+ hidden_dim=cfg.model.hidden_dim,
180
+ num_layers=cfg.model.num_layers,
181
+ )
182
+
183
+ train(
184
+ model=model,
185
+ lr=cfg.training.lr,
186
+ epochs=cfg.training.epochs,
187
+ batch_size=cfg.training.batch_size,
188
+ )
189
+
190
+ # Override from command line:
191
+ # python train.py training.lr=1e-4 model.hidden_dim=512
192
+ # python train.py --multirun training.lr=1e-3,1e-4,1e-5
193
+ ```
194
+
195
+ ```yaml
196
+ # configs/base.yaml
197
+ model:
198
+ name: resnet50
199
+ hidden_dim: 256
200
+ num_layers: 4
201
+
202
+ training:
203
+ lr: 1e-3
204
+ epochs: 100
205
+ batch_size: 32
206
+ optimizer: adamw
207
+ weight_decay: 0.01
208
+
209
+ data:
210
+ dataset: cifar10
211
+ split_ratio: [0.8, 0.1, 0.1]
212
+ random_seed: 42
213
+ augmentation: true
214
+ ```
215
+
216
+ ## Feature Engineering Patterns
217
+
218
+ ```python
219
+ from sklearn.pipeline import Pipeline
220
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
221
+ from sklearn.compose import ColumnTransformer
222
+ from sklearn.impute import SimpleImputer
223
+ import joblib
224
+
225
+ def build_feature_pipeline(numeric_cols: list, categorical_cols: list) -> Pipeline:
226
+ """Build a reproducible feature engineering pipeline."""
227
+ numeric_transformer = Pipeline([
228
+ ("imputer", SimpleImputer(strategy="median")),
229
+ ("scaler", StandardScaler()),
230
+ ])
231
+
232
+ categorical_transformer = Pipeline([
233
+ ("imputer", SimpleImputer(strategy="most_frequent")),
234
+ ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
235
+ ])
236
+
237
+ preprocessor = ColumnTransformer([
238
+ ("num", numeric_transformer, numeric_cols),
239
+ ("cat", categorical_transformer, categorical_cols),
240
+ ])
241
+
242
+ return preprocessor
243
+
244
+ # Save and load for reproducibility
245
+ preprocessor.fit(X_train)
246
+ joblib.dump(preprocessor, "artifacts/preprocessor.pkl")
247
+ # Later: preprocessor = joblib.load("artifacts/preprocessor.pkl")
248
+ ```
249
+
250
+ ## Makefile for Reproducibility
251
+
252
+ ```makefile
253
+ .PHONY: setup data train evaluate all clean
254
+
255
+ setup:
256
+ pip install -r requirements.txt
257
+ dvc pull
258
+
259
+ data:
260
+ python src/data/prepare.py --config configs/base.yaml
261
+
262
+ train:
263
+ python src/training/train.py --config configs/base.yaml
264
+
265
+ evaluate:
266
+ python src/evaluation/evaluate.py --config configs/base.yaml
267
+
268
+ all: setup data train evaluate
269
+
270
+ sweep:
271
+ python src/training/train.py --multirun \
272
+ training.lr=1e-3,1e-4,1e-5 \
273
+ model.hidden_dim=128,256,512
274
+
275
+ clean:
276
+ rm -rf outputs/ multirun/ __pycache__/
277
+ ```
278
+
279
+ ## Best Practices
280
+
281
+ - **Never modify raw data.** All transformations should be scripted and reproducible.
282
+ - **Pin every dependency version** including CUDA, cuDNN, and OS-level libraries.
283
+ - **Separate configuration from code.** Use YAML/JSON configs, not hardcoded values.
284
+ - **Track experiments from day one.** Retrofitting experiment tracking is painful.
285
+ - **Write tests for data preprocessing.** Shape mismatches and silent data corruption are common.
286
+ - **Use `Makefile` or `dvc repro`** so any collaborator can reproduce results with one command.
287
+ - **Version your data alongside your code** using DVC, Git-LFS, or cloud storage with manifests.
288
+
289
+ ## References
290
+
291
+ - [MLflow documentation](https://mlflow.org/docs/latest/) -- Experiment tracking and model registry
292
+ - [DVC documentation](https://dvc.org/doc) -- Data version control for ML
293
+ - [Hydra documentation](https://hydra.cc/) -- Configuration management framework
294
+ - [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/) -- Project structure template
295
+ - [Made With ML](https://madewithml.com/) -- MLOps best practices for researchers
@@ -0,0 +1,247 @@
1
+ ---
2
+ name: nlp-toolkit-guide
3
+ description: "NLP analysis with perplexity scoring, burstiness, and entropy metrics"
4
+ metadata:
5
+ openclaw:
6
+ emoji: "💬"
7
+ category: "domains"
8
+ subcategory: "ai-ml"
9
+ keywords: ["NLP", "perplexity", "burstiness", "entropy", "tokenization", "text analysis"]
10
+ source: "https://github.com/huggingface/transformers"
11
+ ---
12
+
13
+ # NLP Toolkit Guide
14
+
15
+ ## Overview
16
+
17
+ Natural Language Processing research requires a diverse set of analytical tools beyond standard model training. Text quality assessment, AI-generated text detection, linguistic feature extraction, and corpus analysis all depend on well-understood metrics: perplexity, burstiness, entropy, and their variants.
18
+
19
+ This guide provides practical implementations of these core NLP metrics alongside patterns for tokenization, embedding analysis, and text feature engineering. The focus is on metrics used in active research areas -- AI text detection (perplexity + burstiness classifiers), information-theoretic analysis of corpora, and linguistic diversity measurement.
20
+
21
+ These tools are framework-agnostic where possible, but leverage Hugging Face Transformers for language model operations and standard Python scientific computing libraries for statistical analysis.
22
+
23
+ ## Perplexity Scoring
24
+
25
+ Perplexity measures how well a language model predicts a text. Lower perplexity means the text is more predictable to the model -- a key signal in AI text detection, model evaluation, and domain adaptation.
26
+
27
+ ```python
28
+ import torch
29
+ import numpy as np
30
+ from transformers import AutoModelForCausalLM, AutoTokenizer
31
+
32
+ def compute_perplexity(text: str, model_name: str = "gpt2") -> dict:
33
+ """
34
+ Compute token-level and text-level perplexity using a causal LM.
35
+
36
+ Returns:
37
+ dict with 'perplexity', 'log_likelihood', 'token_perplexities'
38
+ """
39
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
40
+ model = AutoModelForCausalLM.from_pretrained(model_name)
41
+ model.eval()
42
+
43
+ encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
44
+ input_ids = encodings.input_ids
45
+
46
+ with torch.no_grad():
47
+ outputs = model(input_ids, labels=input_ids)
48
+ neg_log_likelihood = outputs.loss.item()
49
+
50
+ # Token-level perplexities for analysis
51
+ with torch.no_grad():
52
+ logits = outputs.logits[:, :-1, :] # Shift for next-token prediction
53
+ targets = input_ids[:, 1:]
54
+ log_probs = torch.log_softmax(logits, dim=-1)
55
+ token_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
56
+ token_perplexities = torch.exp(-token_log_probs).squeeze().tolist()
57
+
58
+ perplexity = np.exp(neg_log_likelihood)
59
+
60
+ return {
61
+ "perplexity": perplexity,
62
+ "log_likelihood": -neg_log_likelihood,
63
+ "token_perplexities": token_perplexities,
64
+ "num_tokens": input_ids.size(1),
65
+ }
66
+ ```
67
+
68
+ ## Burstiness Analysis
69
+
70
+ Burstiness measures the tendency of words to appear in clusters rather than uniformly across a text. Human writing tends to be "burstier" -- once a topic is introduced, related terms cluster together, then disappear.
71
+
72
+ ```python
73
+ from collections import Counter
74
+ import numpy as np
75
+
76
+ def compute_burstiness(text: str, min_freq: int = 2) -> dict:
77
+ """
78
+ Compute burstiness score for a text.
79
+
80
+ Burstiness B = (sigma - mu) / (sigma + mu)
81
+ where sigma and mu are the std dev and mean of inter-arrival times.
82
+ B ranges from -1 (periodic) to 1 (bursty). Human text typically B > 0.
83
+ """
84
+ words = text.lower().split()
85
+ word_positions = {}
86
+ for i, word in enumerate(words):
87
+ word_positions.setdefault(word, []).append(i)
88
+
89
+ burstiness_scores = {}
90
+ for word, positions in word_positions.items():
91
+ if len(positions) < min_freq:
92
+ continue
93
+ inter_arrivals = np.diff(positions)
94
+ mu = np.mean(inter_arrivals)
95
+ sigma = np.std(inter_arrivals)
96
+ if mu + sigma == 0:
97
+ burstiness_scores[word] = 0.0
98
+ else:
99
+ burstiness_scores[word] = (sigma - mu) / (sigma + mu)
100
+
101
+ # Aggregate burstiness
102
+ if burstiness_scores:
103
+ avg_burstiness = np.mean(list(burstiness_scores.values()))
104
+ else:
105
+ avg_burstiness = 0.0
106
+
107
+ return {
108
+ "average_burstiness": avg_burstiness,
109
+ "word_burstiness": burstiness_scores,
110
+ "num_words_analyzed": len(burstiness_scores),
111
+ }
112
+ ```
113
+
114
+ ## Entropy and Information-Theoretic Metrics
115
+
116
+ ```python
117
+ from collections import Counter
118
+ import numpy as np
119
+
120
+ def compute_entropy(text: str, level: str = "word") -> dict:
121
+ """
122
+ Compute Shannon entropy at word or character level.
123
+
124
+ Higher entropy indicates more diverse, less predictable text.
125
+ AI-generated text often has lower entropy than human text.
126
+ """
127
+ if level == "word":
128
+ tokens = text.lower().split()
129
+ elif level == "character":
130
+ tokens = list(text.lower())
131
+ else:
132
+ raise ValueError("level must be 'word' or 'character'")
133
+
134
+ counts = Counter(tokens)
135
+ total = sum(counts.values())
136
+ probabilities = np.array([c / total for c in counts.values()])
137
+
138
+ entropy = -np.sum(probabilities * np.log2(probabilities + 1e-12))
139
+ max_entropy = np.log2(len(counts)) if len(counts) > 1 else 1.0
140
+ normalized_entropy = entropy / max_entropy
141
+
142
+ return {
143
+ "entropy": entropy,
144
+ "normalized_entropy": normalized_entropy,
145
+ "vocabulary_size": len(counts),
146
+ "total_tokens": total,
147
+ "type_token_ratio": len(counts) / total,
148
+ }
149
+
150
+ def compute_conditional_entropy(text: str, n: int = 2) -> float:
151
+ """Compute conditional entropy H(X_n | X_{n-1}) for n-gram analysis."""
152
+ words = text.lower().split()
153
+ if len(words) < n:
154
+ return 0.0
155
+
156
+ ngrams = [tuple(words[i:i+n]) for i in range(len(words) - n + 1)]
157
+ contexts = [ng[:-1] for ng in ngrams]
158
+
159
+ context_counts = Counter(contexts)
160
+ ngram_counts = Counter(ngrams)
161
+
162
+ h = 0.0
163
+ total = len(ngrams)
164
+ for ngram, count in ngram_counts.items():
165
+ context = ngram[:-1]
166
+ p_ngram = count / total
167
+ p_context = context_counts[context] / total
168
+ h -= p_ngram * np.log2(count / context_counts[context] + 1e-12)
169
+
170
+ return h
171
+ ```
172
+
173
+ ## AI Text Detection Pipeline
174
+
175
+ Combining perplexity, burstiness, and entropy into a detection pipeline:
176
+
177
+ ```python
178
+ def analyze_text_authenticity(text: str) -> dict:
179
+ """
180
+ Multi-signal analysis for AI vs. human text classification.
181
+ Uses perplexity, burstiness, and entropy as features.
182
+ """
183
+ perplexity_result = compute_perplexity(text)
184
+ burstiness_result = compute_burstiness(text)
185
+ entropy_result = compute_entropy(text, level="word")
186
+ char_entropy = compute_entropy(text, level="character")
187
+
188
+ # Heuristic thresholds from literature
189
+ signals = {
190
+ "low_perplexity": perplexity_result["perplexity"] < 30,
191
+ "low_burstiness": burstiness_result["average_burstiness"] < 0.1,
192
+ "low_entropy": entropy_result["normalized_entropy"] < 0.7,
193
+ "uniform_token_ppl": np.std(perplexity_result["token_perplexities"]) < 5,
194
+ }
195
+
196
+ ai_score = sum(signals.values()) / len(signals)
197
+
198
+ return {
199
+ "perplexity": perplexity_result["perplexity"],
200
+ "burstiness": burstiness_result["average_burstiness"],
201
+ "word_entropy": entropy_result["entropy"],
202
+ "char_entropy": char_entropy["entropy"],
203
+ "type_token_ratio": entropy_result["type_token_ratio"],
204
+ "ai_likelihood_score": ai_score,
205
+ "signals": signals,
206
+ }
207
+ ```
208
+
209
+ ## Tokenization Patterns
210
+
211
+ ```python
212
+ from transformers import AutoTokenizer
213
+
214
+ def compare_tokenizers(text: str, models: list = None) -> dict:
215
+ """Compare tokenization across different models for research analysis."""
216
+ if models is None:
217
+ models = ["gpt2", "bert-base-uncased", "facebook/opt-1.3b"]
218
+
219
+ results = {}
220
+ for model_name in models:
221
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
222
+ tokens = tokenizer.tokenize(text)
223
+ results[model_name] = {
224
+ "num_tokens": len(tokens),
225
+ "tokens": tokens[:50], # First 50 for inspection
226
+ "vocab_size": tokenizer.vocab_size,
227
+ "compression_ratio": len(text) / len(tokens),
228
+ }
229
+ return results
230
+ ```
231
+
232
+ ## Best Practices
233
+
234
+ - **Always specify the model** when computing perplexity. Perplexity is model-relative, not absolute.
235
+ - **Normalize by text length** when comparing entropy across texts of different sizes.
236
+ - **Use sliding windows** for long documents to capture local variation in metrics.
237
+ - **Combine multiple signals** for AI text detection -- no single metric is reliable alone.
238
+ - **Report confidence intervals** by computing metrics on paragraph-level chunks, then aggregating.
239
+ - **Be aware of domain shift.** Perplexity thresholds trained on news text will not transfer to scientific papers.
240
+
241
+ ## References
242
+
243
+ - [Hugging Face Transformers](https://huggingface.co/docs/transformers/) -- Model hub and tokenizer library
244
+ - [DetectGPT](https://arxiv.org/abs/2301.11305) -- Perplexity-based AI text detection (Mitchell et al., 2023)
245
+ - [Burstiness and Memory in Text](https://doi.org/10.1103/PhysRevLett.114.078101) -- Altmann et al., 2015
246
+ - [NLTK documentation](https://www.nltk.org/) -- Classic NLP toolkit for feature engineering
247
+ - [spaCy documentation](https://spacy.io/) -- Industrial-strength NLP for production pipelines