PyPI - folklore-index - Versions diffs - 0.1.0__tar.gz - Mend

folklore-index 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

folklore_index-0.1.0/PKG-INFO +23 -0
folklore_index-0.1.0/README.md +9 -0
folklore_index-0.1.0/folklore_index/__init__.py +33 -0
folklore_index-0.1.0/folklore_index/folklore_index.json +743 -0
folklore_index-0.1.0/folklore_index.egg-info/PKG-INFO +23 -0
folklore_index-0.1.0/folklore_index.egg-info/SOURCES.txt +8 -0
folklore_index-0.1.0/folklore_index.egg-info/dependency_links.txt +1 -0
folklore_index-0.1.0/folklore_index.egg-info/top_level.txt +1 -0
folklore_index-0.1.0/pyproject.toml +24 -0
folklore_index-0.1.0/setup.cfg +4 -0

folklore_index-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,23 @@
+Metadata-Version: 2.4
+Name: folklore-index
+Version: 0.1.0
+Summary: A runnable benchmark of widely-repeated AI / data-science claims, each ruled REPRODUCED / FAILED / NOT_COMPUTABLE.
+Author: Agora (autonomous research organization)
+License: CC-BY-4.0
+Project-URL: Homepage, https://dancenitra.github.io/agora/public/crucible/
+Project-URL: Source, https://github.com/DanceNitra/agora
+Keywords: ai,llm,evaluation,replication,reproducibility,benchmark,rag,agents,folklore,verification
+Classifier: Programming Language :: Python :: 3
+Classifier: Intended Audience :: Science/Research
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+# folklore-index
+A standing, machine-readable benchmark of widely-repeated AI / data-science claims, each rebuilt as the smallest runnable test and ruled REPRODUCED / FAILED / NOT_COMPUTABLE. Honest, citable receipts for the field's folklore.
+```python
+import folklore_index as fi
+fi.verdicts()
+fi.get('FI-0001')
+```

folklore_index-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,9 @@
+# folklore-index
+A standing, machine-readable benchmark of widely-repeated AI / data-science claims, each rebuilt as the smallest runnable test and ruled REPRODUCED / FAILED / NOT_COMPUTABLE. Honest, citable receipts for the field's folklore.
+```python
+import folklore_index as fi
+fi.verdicts()
+fi.get('FI-0001')
+```

folklore_index-0.1.0/folklore_index/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""The Folklore Index - a runnable benchmark of AI / data-science claims.
+>>> import folklore_index as fi
+>>> fi.verdicts()            # {'REPRODUCED': .., 'FAILED': .., 'NOT_COMPUTABLE': ..}
+>>> fi.get("FI-0001")        # one claim by its permanent citation key
+"""
+import json, os
+# Absolute path of the JSON dataset, resolved relative to this module's own
+# directory so lookups do not depend on the current working directory.
+_DATA = os.path.join(os.path.dirname(__file__), "folklore_index.json")
+def load():
+    """Parse the JSON file at ``_DATA`` and return the resulting document.
+
+    The file is opened as UTF-8 and re-read on every call, so each caller
+    receives a freshly parsed object.
+    """
+    with open(_DATA, encoding="utf-8") as handle:
+        payload = json.load(handle)
+    return payload
+def claims():
+    """Return the value stored under the dataset's ``"entries"`` key (the claim records)."""
+    dataset = load()
+    return dataset["entries"]
+def get(key):
+    """Look up a single claim by its ``"key"`` field (e.g. ``"FI-0001"``).
+
+    Returns the first entry whose ``"key"`` equals *key*, or ``None`` when
+    no entry matches.
+    """
+    for entry in claims():
+        if entry.get("key") == key:
+            return entry
+    return None
+def verdicts():
+    """Return the per-verdict tally stored at ``counts`` -> ``by_verdict`` in the dataset."""
+    tallies = load()["counts"]
+    return tallies["by_verdict"]
+# Evaluated once at import time: reads the dataset file and takes its
+# "version" field, falling back to "0" when that field is absent.
+# Must stay below the definition of load().
+__version__ = load().get("version", "0")

folklore_index-0.1.0/folklore_index/folklore_index.json ADDED Viewed

@@ -0,0 +1,743 @@
+{
+ "name": "The Folklore Index",
+ "version": "0.1.0",
+ "description": "A standing, machine-readable benchmark of widely-repeated AI / data-science claims, each rebuilt as the smallest runnable test and ruled REPRODUCED / FAILED / NOT_COMPUTABLE. Honest, citable receipts for the field's folklore.",
+ "homepage": "https://dancenitra.github.io/agora/public/crucible/",
+ "repo": "https://github.com/DanceNitra/agora",
+ "license": "CC-BY-4.0 (data) / MIT (code)",
+ "schema": {
+  "key": "stable citation id (FI-NNNN), permanent per claim",
+  "domain": "replication | ai-claim",
+  "claim": "the widely-repeated claim, stated precisely",
+  "source": "where the claim comes from / who repeats it",
+  "verdict": "REPRODUCED | FAILED | NOT_COMPUTABLE",
+  "note": "honest summary tying the measured result to the verdict, with scope/caveats",
+  "lab_file": "repo-relative path to the runnable test (may be empty)",
+  "code_url": "GitHub URL to the runnable test if it resolves",
+  "code_resolves": "whether the runnable test file is present in the repo",
+  "date": "YYYY-MM-DD"
+ },
+ "counts": {
+  "total": 59,
+  "by_verdict": {
+   "FAILED": 12,
+   "NOT_COMPUTABLE": 15,
+   "REPRODUCED": 32
+  },
+  "by_domain": {
+   "replication": 51,
+   "ai-claim": 8
+  },
+  "runnable_code_present": 10
+ },
+ "entries": [
+  {
+   "domain": "replication",
+   "claim": "Real-world networks are scale-free: their degree distributions follow a power law p(k) ~ k^-alpha.",
+   "source": "Barabasi-Albert (1999) framing, widely repeated; rigorous tests: Clauset, Shalizi & Newman 2009; Broido & Clauset 2019 'Scale-Free Networks Are Rare'.",
+   "verdict": "FAILED",
+   "note": "Under a rigorous Clauset-Shalizi-Newman fit (MLE alpha, KS-selected xmin, bootstrap goodness-of-fit, Vuong likelihood-ratio vs lognormal, n=20,000): a lognormal that 'looks scale-free' on a log-log plot is correctly REFUSED (power-law GOF p=0.01; LR favors lognormal, -17.5); a genuine Pareto power law passes (GOF p=0.92; LR +102); and a real Barabasi-Albert network is only a TIE (LR -0.1) - power law is not even clearly preferred over lognormal for true preferential attachment. So 'looks scale-free' is not 'is scale-free' and the universal claim is not safely inferable (Broido-Clauset reproduces). Power law IS reproduced for true BA graphs.",
+   "lab_file": "agora_output/lab/20260619-190000_crucible-scale-free-networks-rare.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/lab/20260619-190000_crucible-scale-free-networks-rare.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0001"
+  },
+  {
+   "domain": "replication",
+   "claim": "Emergent abilities of large language models are genuine, sharp capability transitions - a discontinuous jump in skill above a scale threshold.",
+   "source": "Wei et al. 2022 'Emergent Abilities of Large Language Models' (claim); counter: Schaeffer, Miranda & Koyejo 2023 'Are Emergent Abilities a Mirage?' (NeurIPS).",
+   "verdict": "FAILED",
+   "note": "The canonical SHARP 'emergence' curve is reproduced by a SMOOTH, continuous per-token skill measured with a nonlinear exact-match metric: the same smooth skill gives a per-token transition width 7.0 vs an exact-match (L=100) width 1.05 - 6.7x sharper PURELY from the metric - and the apparent onset shifts -0.07 -> +5.58 as answer length grows with NO change in the underlying skill. No capability discontinuity is needed, so benchmark sharpness is not evidence for one (Schaeffer's 'mirage' reproduces). Scope: shows sharpness is metric-dependent; it does not prove that no ability is ever genuinely emergent.",
+   "lab_file": "agora_output/lab/20260619-185500_crucible-emergent-abilities-metric-mirage.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/lab/20260619-185500_crucible-emergent-abilities-metric-mirage.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0002"
+  },
+  {
+   "domain": "replication",
+   "claim": "Degree distribution of competition-induced preferential attachment graphs is a power law up to a finite threshold and decays exponentially above it",
+   "source": "Degree Distribution of Competition-Induced Preferential Attachment Graphs (Berger, Borgs, Chayes, D'Souza, 2005)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "The smallest faithful model of the stated mechanism (1D positions; new vertex connects to earlier vertex minimizing |dx| + alpha*depth) DEGENERATES across alpha to a near-star (max-degree 849->4059 growing with alpha, mean-degree 2, poor po",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-18",
+   "key": "FI-0003"
+  },
+  {
+   "domain": "replication",
+   "claim": "A Pareto regret that reflects Pareto optimality without relying on scalarization functions can be defined and minimized in multi-objective bandits",
+   "source": "Pareto Regret Analyses in Multi-objective Multi-armed Bandit (Xu & Klabjan, 2022)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: 2-objective bandit, known Pareto front (4 optimal + 2 dominated arms). Pareto-UCB1 with the scalarization-free Pareto suboptimality gap achieved sublinear cumulative Pareto regret (avg/step 0.027->0.005, growth factor 1.82 o",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-18",
+   "key": "FI-0004"
+  },
+  {
+   "domain": "replication",
+   "claim": "The epidemic threshold disappears for scale-free / preferential-attachment networks with degree exponent gamma in (2,3]",
+   "source": "Pastor-Satorras & Vespignani (2001); cited by Jones & Handcock (2003), preferential attachment in human sexual networks",
+   "verdict": "REPRODUCED",
+   "note": "Mechanism reproduced: SIS threshold lambda_c=<k>/<k^2> falls with N on BA/scale-free networks (0.112->0.062 as N 500->32000) because <k^2> grows with the hub/cutoff (35.5->64.9), while ER stays finite (~0.20, <k^2>~20). SIS dynamics confirm",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-18",
+   "key": "FI-0005"
+  },
+  {
+   "domain": "replication",
+   "claim": "Metcalfe's Law: the value of a communications network scales as n^2 (the number of possible pairwise connections)",
+   "source": "Metcalfe's Law (folklore/Gilder 1993); contested by Briscoe, Odlyzko & Tilly (2006), 'Metcalfe's Law is Wrong' (IEEE Spectrum), and Metcalfe (2013) Facebook dat",
+   "verdict": "FAILED",
+   "note": "Strong n^2 claim FAILS. Fitting alpha in V ~ n^alpha: the n^2 holds ONLY under the (unrealistic) assumption that every pairwise connection has EQUAL value (alpha=2.00). Under realistic rank-declining connection value (Zipf 1/k, the Briscoe-",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-18",
+   "key": "FI-0006"
+  },
+  {
+   "domain": "replication",
+   "claim": "Diversity trumps ability: a random/diverse group of problem solvers outperforms (>=) a group of the best individual solvers (Hong & Page 2004 PNAS)",
+   "source": "Hong & Page (2004), Groups of diverse problem solvers can outperform groups of high-ability problem solvers, PNAS 101(46)",
+   "verdict": "FAILED",
+   "note": "STRONG claim (random>=best) NOT reproduced in a faithful, paired, statistically-powered minimal model (n=800 ring, pool=120, group=10, random group averaged over 5 draws, 25 landscapes). Random-minus-best is consistently NEGATIVE across heu",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-17",
+   "key": "FI-0007"
+  },
+  {
+   "domain": "replication",
+   "claim": "Compactness and geometric stability conjectures for scalar curvature and convergence (Sormani et al., IAS 2018 workshop)",
+   "source": "Conjectures on Convergence and Scalar Curvature (Sormani et al., 2021)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Open conjectures in geometric analysis (intrinsic-flat / Gromov-Hausdorff convergence under scalar-curvature bounds). No finite computational mechanism to reduce to a smallest model - these are deep manifold-convergence conjectures, not a s",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-17",
+   "key": "FI-0008"
+  },
+  {
+   "domain": "replication",
+   "claim": "The survival probability of a branching process obeys finite-size scaling in the control parameter (offspring mean m=1+eps) and the max generations n: P_n(eps) ~ (1/n)*F(eps*n).",
+   "source": "Garcia-Millan, Font-Clos et al. 2015, \"Finite-size scaling of survival probability in branching processes\"",
+   "verdict": "REPRODUCED",
+   "note": "Smallest Galton-Watson model (Poisson offspring), vectorized over 40k realizations. Critical eps=0: n*P_n -> ~2 (Kolmogorov 2/sigma^2, sigma^2=1). Scaling collapse confirmed: n*P_n is a function of x=eps*n alone - within-x spread 0.116 vs a",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-17",
+   "key": "FI-0009"
+  },
+  {
+   "domain": "replication",
+   "claim": "In the Erdos-Renyi graph G(N,p), the k-clique percolation transition occurs at p_c(k) = 1/[(k-1)N]^(1/(k-1)).",
+   "source": "Derenyi, Palla & Vicsek 2005, 'Clique Percolation in Random Networks' (PRL 94, 160202)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest CPM model: enumerate k-cliques in G(N,p), union any sharing a (k-1)-clique, track largest community fraction R(p)=vertices_in_largest/N; empirical transition = p where R crosses half its max. Measured/formula ratio is near-CONSTANT",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-17",
+   "key": "FI-0010"
+  },
+  {
+   "domain": "replication",
+   "claim": "The epidemic threshold of a network disappears (goes to 0 as N->infinity) when the degree-distribution scaling exponent rho is in 2 < rho <= 3.",
+   "source": "Jones & Handcock 2003, 'An assessment of preferential attachment as a mechanism for human sexual network formation' (after Pastor-Satorras & Vespignani 2001)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: HMF threshold lambda_c=<k>/<k^2> from sampled power-law degree sequences vs N. Measured lambda_c at N=1e3..1e6: gamma=2.3 shrinks x28.7, gamma=2.7 x3.6 (power-law vanishing); gamma=3.0 is MARGINAL - vanishes only logarithmic",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-17",
+   "key": "FI-0011"
+  },
+  {
+   "domain": "replication",
+   "claim": "Whether a mouse geroprotector is recorded as extending lifespan can depend on the survival statistic chosen (log-rank vs Gehan-Wilcoxon); and reporting the best of several survival tests inflates the",
+   "source": "deep-research 2026-06-16 + Jiang et al., GeroScience 2024 (Gehan reanalysis of NIA ITP data)",
+   "verdict": "REPRODUCED",
+   "note": "Self-contained sim (weighted log-rank family, n=50/arm, 4000 trials). Age-localized true effect: log-rank power 32.5% vs Gehan 72.3%; the two tests give DISCORDANT verdicts on 39.9% of identical datasets. Under the null, best-of-3 tests inf",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-16",
+   "key": "FI-0012"
+  },
+  {
+   "domain": "replication",
+   "claim": "Specimens were loaded with low-amplitude cyclic torque to analyze deformation in the small strain range (0.001-0.01%).",
+   "source": "Optical Flow Method for Measuring Deformation of Soil Specimen Subjected to Torsional Shear (Srokosz, Bujko, Bochenska)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Not a falsifiable claim with a result - it is a methods sentence describing a physical geotechnical experiment (optical-flow measurement of a soil specimen under cyclic torsional shear). Off-domain (soil mechanics, not our frontier), no res",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-16",
+   "key": "FI-0013"
+  },
+  {
+   "domain": "replication",
+   "claim": "For BA networks (N=2,000, m=2), removing the top 10% of nodes by degree raises the bond-percolation threshold from p_c=0.174 to 0.776.",
+   "source": "Cachero Sanchez (2026), Simultaneous Degradation of Percolation and Cascade Robustness Under Targeted Hub Removal",
+   "verdict": "REPRODUCED",
+   "note": "Finite-size susceptibility-peak MC on the actual BA network reproduces it: intact p_c 0.170 (claimed 0.174), hub-removed 0.740 (claimed 0.776), both within ~5% over 6 replicas. Methodology lesson: the Molloy-Reed configuration-model estimat",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-16",
+   "key": "FI-0014"
+  },
+  {
+   "domain": "replication",
+   "claim": "Cohen's benchmarks (r=.10/.30/.50, d=.20/.50/.80) are used to interpret observed effect sizes; in gerontology observed effects fall below them, so the benchmarks are miscalibrated (Brydges 2019).",
+   "source": "Effect Size Guidelines, Sample Size Calculations, and Statistical Power in Gerontology (Brydges, 2019)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Cohen's cutoffs are conventions, not empirical predictions, so they cannot be reproduced. Brydges' substantive claim (observed gerontology effect sizes fall below the benchmarks) is an empirical distributional claim requiring his meta-analy",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-16",
+   "key": "FI-0015"
+  },
+  {
+   "domain": "replication",
+   "claim": "Deep networks have enabled RL to scale to more complex domains, but these methods typically require large quantities of training data.",
+   "source": "Sample-Efficient RL with Maximum Entropy Mellowmax Episodic Control (Sarrico, Arulkumaran, Agostinelli et al.)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Extracted sentence is a qualitative background premise (deep model-free RL is sample-inefficient) = a textbook fact, no crisp falsifiable mechanism; modelling it would re-derive known sample-inefficiency. The paper's real contribution (Mello",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-15",
+   "key": "FI-0016"
+  },
+  {
+   "domain": "replication",
+   "claim": "In general, critical systems are associated with fractal / power-law behaviour",
+   "source": "(general statistical-physics statement)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Too general to replicate: no single specific quantitative claim with a computable mechanism, and criticality<->scale-invariance/power-laws is foundational textbook physics where FAILED is NOT a live possibility - reproducing a known critica",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-15",
+   "key": "FI-0017"
+  },
+  {
+   "domain": "replication",
+   "claim": "Recursively synthesized, self-referential systems improve performance (e.g. Promptbreeder, Fernando et al. 2023: ~15pct few-shot gain via self-referential prompt evolution).",
+   "source": "Promptbreeder (Fernando, Banarse, Michalewski, Osindero, Rocktaschel, 2023, arXiv:2309.16797)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model (lab a08981): a population recursively synthesizes candidates FROM ITSELF each generation; the pivotal variable is the SELECTION signal. With an EXTERNAL fitness anchor (as Promptbreeder has: real task accuracy), recursive sy",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-15",
+   "key": "FI-0018"
+  },
+  {
+   "domain": "replication",
+   "claim": "Compactness and geometric stability conjectures for scalar curvature convergence (intrinsic flat / GH convergence of Riemannian manifolds)",
+   "source": "Sormani et al., Conjectures on Convergence and Scalar Curvature (2021, arXiv:2103.10093)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "A survey of OPEN conjectures in geometric analysis (convergence of Riemannian manifolds under scalar-curvature bounds, intrinsic flat distance). No single claim with a finite computable mechanism to model; the content is conjectural/abstrac",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-14",
+   "key": "FI-0019"
+  },
+  {
+   "domain": "replication",
+   "claim": "Finite-size scaling exists for the survival probability of a branching process as a function of the control parameter (m-1) and the maximum number of generations n: S_n ~ n^-1 G((m-1)*n).",
+   "source": "Garcia-Millan, Font-Clos & Corral (2015), Finite-size scaling of survival probability in branching processes (arXiv:1508.01515 / Phys. Rev. E 91, 042122).",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: exact survival curve of a Poisson(m) Galton-Watson process via PGF iteration q_n=exp(m(q_{n-1}-1)), S_n=1-q_n (zero Monte-Carlo noise). Three FSS signatures all confirmed: (1) exact critical amplitude n*S_n -> 2 = 2/sigma^2",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-14",
+   "key": "FI-0020"
+  },
+  {
+   "domain": "replication",
+   "claim": "In DiD with FEW treated units and spatially/serially-correlated errors, standard DiD inference under-covers (invalid CIs) while synthetic control is materially better when parallel trends ~holds",
+   "source": "Alvarez & Ferman (2020)",
+   "verdict": "REPRODUCED",
+   "note": "N=30,T=12,1 treated,rho=0.7,true effect=0,800 reps: DiD 95% CI coverage=0.305 (severe under-coverage vs nominal 0.95); SC coverage=0.891; SC RMSE=1.017 < DiD RMSE=1.267. DiD inference invalid, SC materially better.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-14",
+   "key": "FI-0021"
+  },
+  {
+   "domain": "replication",
+   "claim": "p-hacking inflates Type I error in the error-statistical (Neyman-Pearson) approach but not in formal/likelihood inference that accounts for the selection",
+   "source": "Rubin (2026)",
+   "verdict": "REPRODUCED",
+   "note": "Under a true null (N=5000, n=30, K=5 forks): NP p-hacking (report min p) inflates Type I error 0.051->0.227 (+17.6pp, 4.45x, matches 1-.95^5=0.226). Formal/likelihood: a selection-ACCOUNTED likelihood (correct best-of-K sampling distributio",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-14",
+   "key": "FI-0022"
+  },
+  {
+   "domain": "replication",
+   "claim": "Context-aware AI augmentation (adaptive type/timing/scale) preserves cognitive flow (difficulty-skill match); poorly-timed/scaled interventions disrupt flow and hurt reasoning performance",
+   "source": "Dissanayake & Nanayakkara (2025)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Design/HCI framework, not a falsifiable empirical claim: a minimal flow model (flow=difficulty-skill match) confirms a gap-closing intervention by construction, so the directional result is near-tautological — no genuine FAILED was possible. Downgraded from a constructed-model REPRODUCED to keep the ledger honest.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-14",
+   "key": "FI-0023"
+  },
+  {
+   "domain": "replication",
+   "claim": "Removing top-10% degree nodes from a BA network (N=2000,m=2) raises the bond-percolation threshold p_c 0.174->0.776",
+   "source": "Simultaneous Degradation of Percolation and Cascade",
+   "verdict": "REPRODUCED",
+   "note": "Mechanism+direction reproduced: hub removal collapses <k^2> 50.8->3.8, p_c jumps ~8x (0.085->0.687 via Cohen mean-field). After-value within 12% of claim; before differs 2x (mean-field vs direct simulation). Robust-yet-fragile confirmed.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-13",
+   "key": "FI-0024"
+  },
+  {
+   "domain": "replication",
+   "claim": "Derenyi-Palla-Vicsek (2005): k-clique percolation in ER graphs at p_c(k)=[(k-1)N]^(-1/(k-1)); for k=3, p_c ~ 1/sqrt(2N)",
+   "source": "Clique Percolation in Random Networks (Derenyi, Palla, Vicsek, 2005)",
+   "verdict": "REPRODUCED",
+   "note": "k=3 scaling exponent confirmed: empirical p_c*sqrt(2N) constant across N=400/800/1600 (1.26,1.26,1.19). N^(-1/2) scaling reproduced; prefactor ~1.2x asymptotic formula due to finite-size + 50%-coverage operational threshold.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-13",
+   "key": "FI-0025"
+  },
+  {
+   "domain": "replication",
+   "claim": "Berger (2003): conditional-frequentist testing reconciles Fisher/Neyman/Jeffreys - the conditional error probability equals the Bayesian posterior; the naive p-value overstates evidence against H0",
+   "source": "Berger (2003), Could Fisher, Jeffreys and Neyman Have Agreed on Testing?",
+   "verdict": "REPRODUCED",
+   "note": "Calibration exact: empirical freq(H0 true | evidence) = Bayesian P(H0|x) across all bins. Berger-Sellke: p=0.05 -> P(H0|x)=0.216, p=0.005 -> 0.041 (p-value overstates evidence ~4x at 0.05).",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-13",
+   "key": "FI-0026"
+  },
+  {
+   "domain": "replication",
+   "claim": "Systems in the same universality class share the same critical exponents (Lubeck 2004)",
+   "source": "Universal Scaling Behavior of Non-Equilibrium Phase Transitions (Sven Lubeck, 2004)",
+   "verdict": "REPRODUCED",
+   "note": "Three structurally different Z2 mean-field models (tanh self-consistency, phi^4 free energy, arctan self-consistency) all give order-parameter exponent beta=0.500; a different-class absorbing-state model gives beta=1.000. Same class -> same",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0027"
+  },
+  {
+   "domain": "replication",
+   "claim": "LinUCB (Chu et al 2011): linear contextual bandit achieves O(sqrt(Td log^3)) i.e. sublinear ~sqrt(T) regret w.p. 1-delta",
+   "source": "Contextual Bandits with Linear Payoff Functions (Chu, Li, Reyzin, Schapire, 2011)",
+   "verdict": "REPRODUCED",
+   "note": "Empirical regret growth exponent 0.03-0.11 (sub-sqrt(T), well inside the O(sqrt(T)) upper bound); cum regret 3-8 vs linear non-learning regret 1400-2700; d-scaling ~sqrt(d) to d. Computable core holds.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0028"
+  },
+  {
+   "domain": "replication",
+   "claim": "BA(N=2000,m=2): removing top-10% nodes by degree raises bond-percolation threshold p_c 0.174 -> 0.776 (~4.5x)",
+   "source": "Simultaneous Degradation of Percolation and Cascade Robustness Under Targeted Hub Removal (Cachero Sanchez, 2026-03-05)",
+   "verdict": "REPRODUCED",
+   "note": "MC before/after = 0.150/0.680 (ratio 4.53x) vs claimed ratio 4.46x; absolute values within finite-size tolerance. Cascade claim at phi=0.22 out of scope.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0029"
+  },
+  {
+   "domain": "replication",
+   "claim": "Kruger & Dunning (1999): the unskilled systematically overestimate their ability - a metacognitive deficit, shown by the quartile plot (bottom quartile +46pp, top -13pp)",
+   "source": "Kruger & Dunning, J Pers Soc Psychol 77(6), 1999 - Unskilled and Unaware of It",
+   "verdict": "FAILED",
+   "note": "A null model with ZERO metacognitive deficit (identical self-error at every skill level) reproduces the canonical plot AND its asymmetry: bottom +45.8 (DK: +46), top -14.2 (DK: -13) - just regression to the mean on a noisy test + a UNIFORM",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0030"
+  },
+  {
+   "domain": "replication",
+   "claim": "Miller (1957): random typing (letters + space) produces Zipf law, so Zipf law in language carries no linguistic significance",
+   "source": "Miller, American Journal of Psychology 70, 1957 - Some effects of intermittent silence",
+   "verdict": "REPRODUCED",
+   "note": "Random-typing corpus (443k words): rank-frequency exponent -1.24, an approximate power law - the math reproduces. Severe test of a proposed counter (staircase fine-structure discriminator, tie fraction of adjacent ranks): 0.89 vs 0.48 at ma",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0031"
+  },
+  {
+   "domain": "replication",
+   "claim": "Evans & Archer (1968) / textbook canon: a portfolio of ~20-30 stocks achieves practically complete diversification - marginal benefit beyond 30 is negligible",
+   "source": "Evans & Archer, Journal of Finance 23(4), 1968 - Diversification and the reduction of dispersion",
+   "verdict": "REPRODUCED",
+   "note": "Holds at realistic tails: N=30 captures 95.6-96.9% of achievable vol AND ES99 reduction (normal and t2.5 idio + 1-factor). MEASURED BOUNDARY: near infinite-variance idio tails (t1.8) the canon breaks - only 85.4% of tail-risk reduction capt",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0032"
+  },
+  {
+   "domain": "replication",
+   "claim": "Social media platforms enable rapid dissemination of information but also facilitate the spread of misinformation",
+   "source": "internal corp finding (descriptive)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Descriptive trend claim with no stated mechanism or quantity; the simulable core (false-news cascades spread farther/faster, Vosoughi 2018) is a DIFFERENT, specific claim - queued as a future Crucible target rather than strawmanned here.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0033"
+  },
+  {
+   "domain": "replication",
+   "claim": "Gilovich, Vallone & Tversky (1985): the basketball hot hand is a cognitive illusion - P(hit | streak of hits) is no higher than P(hit | streak of misses), so streak shooting does not exist",
+   "source": "Gilovich, Vallone & Tversky, Cognitive Psychology 17, 1985 - The hot hand in basketball: On the misperception of random sequences",
+   "verdict": "FAILED",
+   "note": "The canonical method manufactures its own conclusion. An iid shooter with NO hot hand shows P(hit|3H)-P(hit|3M) = -7.9pp (t=-28) under GVT estimator (n=100) - the streak-selection bias of Miller & Sanjurjo (2018). Bias grows to -17pp at k=4",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0034"
+  },
+  {
+   "domain": "replication",
+   "claim": "Hong-Page (2004): a randomly selected (diverse) team of problem solvers outperforms the team of individually best solvers under relay search (n=2000, l=12, k=3, group 10)",
+   "source": "Hong & Page, PNAS 101(46), 2004 - Groups of diverse problem solvers can outperform groups of high-ability problem solvers",
+   "verdict": "REPRODUCED",
+   "note": "Reproduced at paper params: random beats best by +1.65 (t=4.1). But the effect is FRAGILE, as critics argued: it shrinks with a narrower heuristic pool (l=6: +0.90) and REVERSES on smoothed landscapes (-0.38, ability wins) - diversity trump",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0035"
+  },
+  {
+   "domain": "replication",
+   "claim": "SGD locally converges to a minimum of a non-convex objective with a quantifiable rate",
+   "source": "Fehrman, Gess, Jentzen — Convergence rates for SGD for non-convex objective functions",
+   "verdict": "REPRODUCED",
+   "note": "Double-well (x^2-1)^2 (non-convex, minima +-1, max at 0). SGD noisy gradient, step c/(t+t0) started in the x*=1 basin: E[(x-1)^2] decays with fitted exponent -1.00 = theory O(1/t); constant step stalls at a variance floor. Local convergence",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0036"
+  },
+  {
+   "domain": "replication",
+   "claim": "Generative AI integration into software engineering is rapidly expanding/automating tasks",
+   "source": "On agent-based software engineering (Jennings 2000)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Garbled target: the claim is a malformed finding fragment (a vague expansion trend, no mechanism to model), and the cited source (Jennings 2000, agent-based SE) PREDATES and does not support a GenAI claim. No computable core; mismatched cit",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-12",
+   "key": "FI-0037"
+  },
+  {
+   "domain": "replication",
+   "claim": "Critical systems are associated with fractal or power-law scaling and long-range correlations",
+   "source": "Kitzbichler, Smith, Rahn (2009) Broadband Criticality of Human Brain Network Synchronization",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model = critical branching process (neuronal avalanches). At sigma=1 avalanche-size P(s)~s^-1.42 (mean-field 3/2); off-critical the power law vanishes (sub: cutoff tau 2.45; super: 9.4pct runaway). Mean size diverges 6.7->1586 appr",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0038"
+  },
+  {
+   "domain": "replication",
+   "claim": "In the low-data regime you can learn online policies with 2-10x fewer total coefficients with little to no loss of performance.",
+   "source": "Biologically inspired architectures for sample-efficient deep RL (Richemond, Kolbeinsson et al.)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: supervised low-data fit (16 training points, smooth target, polynomial basis). Test MSE by coefficient count K: K=6 -> 0.015 (best); K=14 -> 3009 (catastrophic overfit). Shrinking from K=14 to K=6 = 2.3x fewer coefficients a",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0039"
+  },
+  {
+   "domain": "replication",
+   "claim": "In scale-free networks the epidemic threshold vanishes as N grows (lambda_c = <k>/<k^2> -> 0 via the <k^2> divergence), unlike homogeneous networks which keep a finite threshold.",
+   "source": "Epidemic spreading in scale-free networks (Pastor-Satorras & Vespignani, 2001)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: heterogeneous mean-field SIS threshold lambda_c=<k>/<k^2>, BA (scale-free) vs ER (homogeneous) at mean degree ~4. BA <k^2> grows with N (45->63 over N=500->20000) so lambda_c shrinks 0.088->0.064, heading to 0 = threshold VA",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0040"
+  },
+  {
+   "domain": "replication",
+   "claim": "LinUCB-style linear contextual bandit achieves sublinear regret, O(sqrt(Td) polylog), holding with high probability (Chu, Li, Reyzin, Schapire 2011).",
+   "source": "Contextual Bandits with Linear Payoff Functions (Chu, Li, Reyzin, Schapire, 2011)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: LinUCB on a d=6, K=12 linear contextual bandit, reward = x.theta*+noise. Cumulative regret stays ~flat (1.5->1.8 from T=500->4000, log-log exponent 0.09) - clearly SUBLINEAR and far inside the O(sqrt(dT))~155 bound at T=4000",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0041"
+  },
+  {
+   "domain": "replication",
+   "claim": "SGD is popular for large-scale optimization but has slow asymptotic convergence due to the inherent variance of the stochastic gradient.",
+   "source": "Accelerating Stochastic Gradient Descent using Predictive Variance Reduction (Johnson & Zhang, 2013)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: minimize a strongly-convex least-squares (scalar w) with label noise. Constant-step SGD stalls at a variance FLOOR (suboptimality ~1.9e-3, never reaches the optimum); full-gradient GD and variance-reduced SVRG both drive sub",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0042"
+  },
+  {
+   "domain": "replication",
+   "claim": "Branching processes exhibit finite-size scaling of the survival probability P_n as a function of the control parameter m and the maximum number of generations n (critical n*P_n -> const, collapse in (",
+   "source": "Finite-size scaling of survival probability in branching processes (Garcia-Millan, Font-Clos, Corral, 2015)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: Galton-Watson with Poisson(m) offspring, survival = population>0 at gen n. Both predictions reproduce: (a) at criticality m=1, n*P_n converges to ~1.9 (theory 2/sigma^2=2, sigma^2=1) -> P_n~C/n, exponent -1; (b) finite-size-",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0043"
+  },
+  {
+   "domain": "replication",
+   "claim": "BA networks N=2000 m=2: removing the top 10% of nodes by degree raises the bond-percolation threshold from p_c=0.174 to 0.776 (severalfold robustness loss under targeted hub removal).",
+   "source": "Simultaneous Degradation of Percolation and Cascade Robustness Under Targeted Hub Removal (Cachero Sanchez, 2026-03-05)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: BA(N=2000,m=2), Molloy-Reed p_c=<k>/(<k^2>-<k>) on the empirical degree sequence + a direct bond-percolation sweep (union-find, giant>50%). MECHANISM REPRODUCED: targeted top-10% hub removal raises the threshold several-fold",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0044"
+  },
+  {
+   "domain": "replication",
+   "claim": "Brynjolfsson & Hitt (2000): organizational investments have a large influence on the value of IT investments (IT/org-capital complementarity).",
+   "source": "Beyond Computation: Information Technology, Organizational Transformation and Business Performance (Brynjolfsson & Hitt, 2000)",
+   "verdict": "REPRODUCED",
+   "note": "Smallest model: logY=a*IT+b*ORG+g*(IT*ORG). Recovered g~0.06; omitting org capital inflates apparent IT return, bias growing with co-investment corr (naive b_IT=0.21 vs true 0.12 at rho=0.8). Reproduces their methodological warning. Falsifi",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0045"
+  },
+  {
+   "domain": "replication",
+   "claim": "Toutanova-Chen (2015): a simple observed-features model matches or beats latent-feature (embedding) models on knowledge-base completion",
+   "source": "Toutanova and Chen (2015)",
+   "verdict": "REPRODUCED",
+   "note": "On a synthetic KB with planted compositional structure (target relation = r0->r1 path), the OBSERVED-features model (2-hop path + degree) scored AUC 0.966 vs the LATENT SVD model 0.749 (+0.216). Path features capture composition that low-ra",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0046"
+  },
+  {
+   "domain": "replication",
+   "claim": "Barato-Hinrichsen (2009): non-Markovian spreading with heavy-tailed waiting times t^{-1-m} shows an absorbing-state phase transition",
+   "source": "Barato and Hinrichsen (2009)",
+   "verdict": "REPRODUCED",
+   "note": "Minimal non-Markovian branching simulation (lab 37ad2a) reproduces the absorbing-state transition: survival ~0 for branching ratio lambda<1, rises for lambda>1, threshold lambda_c~1 IDENTICAL for m=0.5 and m=1.5. The heavy tail sets finite-",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0047"
+  },
+  {
+   "domain": "replication",
+   "claim": "Safe-Sora (Su et al. 2025) addresses invisible generative watermarking for video by embedding graphical watermarks",
+   "source": "A Comprehensive Review (survey)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Descriptive existence claim (a framework ADDRESSES an area), not a quantitative result - nothing to re-run as a minimal model. Replication needs a measured number (e.g. detection AUC, bit-error rate under compression), which this survey-sou",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0048"
+  },
+  {
+   "domain": "replication",
+   "claim": "Duration of liquid-to-glass transition in a supercooled metallic alloy decreases exponentially with [truncated]",
+   "source": "Thermodynamic model for the glass transition (truncated)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "The claim's dependent variable is truncated (decreases exponentially WITH WHAT is cut off) and it is flagged UNCERTAIN/unsettled by the literature - no precise quantitative relationship to re-run. Note: canonical supercooled relaxation is su",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-11",
+   "key": "FI-0049"
+  },
+  {
+   "domain": "replication",
+   "claim": "The AI-biology analogy breaks because brains use local Hebbian synaptic homeostasis",
+   "source": "internal frontier",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Already resolved by dialectic-where-ai-biology-analogy-breaks.md (breaks at the learning rule; backprop has no measured weight transport, learning survives random feedback). A verbal break-claim with no quantitative model to re-run.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-10",
+   "key": "FI-0050"
+  },
+  {
+   "domain": "replication",
+   "claim": "A multi-agent RL model trained on historical Agora transaction data predicts trader behavior",
+   "source": "AGORA high-resolution galaxy simulations (mis-cited - name collision)",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "No computable core: claim references internal/fictional Agora transaction data we do not have, and the cited source is a mis-attributed galaxy-simulation paper (AGORA name collision). Nothing real to replicate.",
+   "lab_file": "",
+   "code_url": "",
+   "code_resolves": false,
+   "date": "2026-06-10",
+   "key": "FI-0051"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "Multi-agent systems (an orchestrator delegating to sub-agents) outperform a single agent on complex tasks.",
+   "source": "2026 agent-engineering folklore (contested: Cognition 'Don't Build Multi-Agents' vs Anthropic's multi-agent research system); Gartner: ~40% of agentic projects scrapped.",
+   "verdict": "NOT_COMPUTABLE",
+   "note": "Tested at FIXED token cost on decomposable ground-truth tasks (one base model, only the scaffold differs; single self-consistency vs an orchestrator+workers+aggregator scaffold). Run 3x: the SAME config swings 0.15-0.20 in accuracy run-to-run on IDENTICAL tasks (multi@wmax1400: 0.75/0.75/0.92; single k=5: 0.96/0.75) at n=24, temp 0.7. The comparison is dominated by LLM-stochasticity noise at realistic eval scale -> the claim cannot be cleanly settled, AND the loud folklore (in both directions) rests on under-powered, non-reproducible comparisons. A stable verdict would need large-n multi-seed evals, which the folklore/blog benchmarks do not run. Honest scope: our own eval is also too small for a positive claim; the finding is the NON-REPRODUCIBILITY itself.",
+   "lab_file": "agora_output/aiclaims/20260619_multi_vs_single_agent_eval.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260619_multi_vs_single_agent_eval.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0052"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "When you evaluate N models/configs on a benchmark and report the top scorer's number, that score is a reliable estimate of its true performance and the winner is the truly-best model.",
+   "source": "Universal AI benchmarking / leaderboard practice (SOTA reporting, hyperparameter + model selection on a held-out benchmark).",
+   "verdict": "FAILED",
+   "note": "Winner's curse (selection-on-the-max), clean DETERMINISTIC model (no LLM noise - the lesson from entry #1): N models with true accuracies clustered within sigma_true=0.04, each measured with eval noise (finite test set + run-to-run variance). At N=50 models and eval-noise SE 0.06 (~ a 50-200 item benchmark): the reported winner's score is inflated by +0.112 (the SOTA bar is overstated), the observed winner is the truly-best model only 17% of the time, and trusting it costs 0.040 true accuracy vs the real best. Both effects GROW with the number of candidates and the eval noise (P(true best): N=5 -> 62%, N=100 -> 13%). So 'the leaderboard winner's score is reliable' FAILS: more candidates + noisier evals = more inflated, less trustworthy. (Ties to entry #1: the same eval noise that makes multi-vs-single non-reproducible is what drives this inflation.)",
+   "lab_file": "agora_output/aiclaims/20260619_benchmark_winners_curse.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260619_benchmark_winners_curse.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0053"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "Retrieval-augmented frontier models weigh a retrieved document against their own knowledge - they won't blindly adopt a doc that contradicts what they correctly know.",
+   "source": "RAG-robustness folklore; measured on real models via the Grounding Firewall poison protocol (Agora). The Poison-Deference Index.",
+   "verdict": "FAILED",
+   "note": "Poison-Deference Index: 12 factual questions each model answered CORRECTLY without context, then given a context asserting the WRONG answer (real LLMs, k=3 order-corrected, thinking-robust reader). deepseek-v4-flash: PDR=92% (flips to the false answer on 11/12 questions it knew), CPR=83% (confidently wrong). glm-5.2: PDR=92%, CPR=92%. BOTH frontier models abandon their correct knowledge for a planted-false doc ~92% of the time, almost always confidently - retrieval OVERRIDES rather than augments. (Strong-assertion poison; 2 models so far - a live index across more models is the next step. Mitigation already exists: the Grounding Firewall's abstain-on-high-sensitivity gate catches exactly these.)",
+   "lab_file": "agora_output/aiclaims/20260619_poison_deference_index.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260619_poison_deference_index.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0054"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "AI agent success decays with a CONSTANT hazard (an exponential 'half-life') as the task gets longer.",
+   "source": "arXiv 2505.05115 'Is there a half-life for the success rates of AI agents?'; tested on METR time-horizon data (metr.org/blog/2025-03-19; epoch.ai/benchmarks/metr-time-horizons).",
+   "verdict": "REPRODUCED",
+   "note": "Fit on METR's real public anchors (success vs human-task-length: ~99% @ 4min, ~80% @ 15min, 50% @ 60min [Claude 3.7], <10% @ 240min). A constant-hazard exponential (P = 0.5^(t/60min)) fits well: predicts 0.95/0.84/0.50/0.06 vs observed 0.99/0.80/0.50/0.08, SSE 0.0032 - actually BETTER than the logistic-in-log-time (SSE 0.0075, which overshoots the tail). The half-life claim holds. Minor tension: the t80/t50 ratio is slightly steeper than exponential (0.250 vs 0.322), hinting at mild extra steepening, but 4 anchor points can't resolve it. HONEST SCOPE: public anchors only (Claude 3.7 + aggregate); METR's raw per-task data would sharpen the per-model hazard.",
+   "lab_file": "agora_output/aiclaims/20260619_metr_horizon_survival_multiverse.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260619_metr_horizon_survival_multiverse.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0055"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "The 'AI time horizon' is a robust headline number (supporting 'AI will automate month-long tasks within ~5 years').",
+   "source": "METR time-horizon headline + 7-month-doubling extrapolation (metr.org/blog/2025-03-19); multiverse / specification-curve method (One Model Many Scores, arXiv 2308.16681).",
+   "verdict": "FAILED",
+   "note": "Multiverse over ONE analytic fork METR itself exposes - the success-threshold choice. From the fitted curve on METR's real anchors the horizon is 60 min at 50% success but 21 min at 80% (and 170 min at 20%): a 2.8x swing from an arbitrary threshold. At doubling-every-7-months that 2.8x is ~11 months of apparent 'progress' - so the famous 'month-long tasks in ~5 years' timeline slips ~11 months if you (reasonably) demand 80% reliability instead of 50%. The headline is NOT a robust single number; it rides on an unstated analytic choice. (A full specification curve over more forks needs METR's raw per-task data.)",
+   "lab_file": "agora_output/aiclaims/20260619_metr_horizon_survival_multiverse.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260619_metr_horizon_survival_multiverse.py",
+   "code_resolves": true,
+   "date": "2026-06-19",
+   "key": "FI-0056"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "LLMs inherit human cognitive biases - e.g. conservatism in Bayesian belief updating (people under-revise relative to Bayes; Edwards 1968).",
+   "source": "Folklore that LLMs reproduce human judgment-and-decision biases; tested on the canonical bookbag-and-poker-chip conservatism task with real models.",
+   "verdict": "FAILED",
+   "note": "On the exact task where humans are reliably conservative (two equally-likely sources, symmetric cue validity q=0.70, an R/B signal sequence; Bayesian posterior ~0.97), both frontier models return the EXACT Bayesian posterior when the likelihoods are specified: deepseek-v4-flash mean conservatism gap = -0.001 (4/5 items parsed), glm-5.2 = +0.000 (5/5). Humans on the same task report ~0.70 (a large conservative gap); the LLMs show ~0.000. The bias does NOT transfer - handed the diagnosticity as numbers, the model just computes the likelihood ratio. HONEST SCOPE: only the SPECIFIED-likelihood case is settled. The human-analogue INFER case (model must ESTIMATE cue validity from a training sample, as humans do) is NOT cleanly measurable here - the harder estimate-then-update task exhausts the thinking models' token budget before a parseable answer (1/10 parsed), and forcing answer-first would suppress the reasoning the task needs. So: conservatism does not appear when cue validity is told; whether it appears when cue validity must be learned is open.",
+   "lab_file": "agora_output/lab/20260620-035500_llm-conservatism-cue-validity-specification.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/lab/20260620-035500_llm-conservatism-cue-validity-specification.py",
+   "code_resolves": true,
+   "date": "2026-06-20",
+   "key": "FI-0057"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "Smaller chunks improve RAG retrieval quality - 'when in doubt, chunk smaller' raises precision/relevance.",
+   "source": "Common RAG-engineering folklore (chunk-size tuning advice; 'smaller chunks = higher precision').",
+   "verdict": "FAILED",
+   "note": "Deterministic numpy test: a 200-token document with one CONTIGUOUS gold span (length 30-50), fixed-grid chunking, each chunk scored by gold density (the precision force that rewards small chunks), retrieve top-k=3, measure recovery = gold tokens recovered / span length. Smaller is NOT better: the smallest chunk c=10 recovers only 0.750 of a 40-token span while c>=20 recovers 1.000 - the minimum chunk size is the WORST, and the optimum sits AT/ABOVE the span scale (best chunk = 20-40) in 9/9 robustness cells (seeds x span length). Mechanism: small chunks do maximize per-chunk density, but a grid cut fragments a contiguous span into more pieces than the top-k=3 budget can reassemble, and that recall loss dominates the precision gain. Anti-rig CONTROL: retrieving ALL chunks (no budget) flattens recovery to exactly 1.000 for every chunk size - proving the reversal is caused by the retrieval budget, not baked into the data. HONEST SCOPE: a noiseless density oracle, binary relevance, a single contiguous span, fixed grid boundaries - it isolates the fragmentation-vs-precision tradeoff, not embedding noise or overlapping/recursive chunkers. Conditional takeaway: 'smaller = better' is false whenever relevant evidence is contiguous and longer than a chunk under a tight top-k budget. (Generated + adversarially verified by an Agora workflow: 1 of 4 candidate claims survived 2 independent skeptic referees; the other 3 were killed as textbook/Condorcet or stipulated-geometry.)",
+   "lab_file": "agora_output/aiclaims/20260620_rag_chunk_size_not_monotone.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260620_rag_chunk_size_not_monotone.py",
+   "code_resolves": true,
+   "date": "2026-06-20",
+   "key": "FI-0058"
+  },
+  {
+   "domain": "ai-claim",
+   "claim": "Adding a reranker (cross-encoder) on top of first-stage retrieval reliably improves end-to-end RAG accuracy, or at worst never hurts it ('drop in a reranker for a free boost').",
+   "source": "Cohere / Pinecone / LangChain / LlamaIndex RAG tutorials and 'production RAG checklist' blogs; repeated as received wisdom by practitioners.",
+   "verdict": "FAILED",
+   "note": "Deterministic numpy model (seed 0, n=200k queries): gold doc + 3 hard negatives (lexically similar) + 27 soft distractors; NO-RERANK uses a noisy first-stage scorer, RERANK uses a CLEANER scorer but inflates the 3 hard negatives by `infl` (a cross-encoder fooled by lexical overlap); end-to-end correctness depends on the gold doc's rank in a position-biased top-5. Measured delta (rerank - no-rerank): infl=0.0 -> +0.139, 1.4 -> +0.013, 2.2 -> -0.074, 3.0 -> -0.131. The reranker HELPS when it is merely cleaner (control infl=0: +0.139) but crosses to a NET LOSS at infl* ~ 1.52 and reaches -0.131 - so 'a reranker never hurts / free boost' is FAILED: a second scorer with a different error profile can demote the gold doc out of a position-biased top-k. Anti-rig: the control (infl=0) shows the harness CAN produce a gain (not baked to fail); the verdict is read off the printed curve. HONEST SCOPE: a stylized simulation, not real corpora/models - it shows the loss is mechanically in-range and the condition for it (non-trivial hard-negative susceptibility), not that any specific production reranker degrades a specific stack; the realistic magnitude of `infl` for real cross-encoders is the open empirical question. The defensible takeaway: 'always helps' is not a safe default.",
+   "lab_file": "agora_output/aiclaims/20260620_reranker_not_free_boost.py",
+   "code_url": "https://github.com/DanceNitra/agora/blob/main/agora_output/aiclaims/20260620_reranker_not_free_boost.py",
+   "code_resolves": true,
+   "date": "2026-06-20",
+   "key": "FI-0059"
+  }
+ ]
+}

folklore_index-0.1.0/folklore_index.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,23 @@
+Metadata-Version: 2.4
+Name: folklore-index
+Version: 0.1.0
+Summary: A runnable benchmark of widely-repeated AI / data-science claims, each ruled REPRODUCED / FAILED / NOT_COMPUTABLE.
+Author: Agora (autonomous research organization)
+License: CC-BY-4.0
+Project-URL: Homepage, https://dancenitra.github.io/agora/public/crucible/
+Project-URL: Source, https://github.com/DanceNitra/agora
+Keywords: ai,llm,evaluation,replication,reproducibility,benchmark,rag,agents,folklore,verification
+Classifier: Programming Language :: Python :: 3
+Classifier: Intended Audience :: Science/Research
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+# folklore-index
+A standing, machine-readable benchmark of widely-repeated AI / data-science claims, each rebuilt as the smallest runnable test and ruled REPRODUCED / FAILED / NOT_COMPUTABLE. Honest, citable receipts for the field's folklore.
+```python
+import folklore_index as fi
+fi.verdicts()
+fi.get('FI-0001')
+```

folklore_index-0.1.0/folklore_index.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,8 @@
+README.md
+pyproject.toml
+folklore_index/__init__.py
+folklore_index/folklore_index.json
+folklore_index.egg-info/PKG-INFO
+folklore_index.egg-info/SOURCES.txt
+folklore_index.egg-info/dependency_links.txt
+folklore_index.egg-info/top_level.txt

folklore_index-0.1.0/folklore_index.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

folklore_index-0.1.0/folklore_index.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+folklore_index

folklore_index-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "folklore-index"
+version = "0.1.0"
+description = "A runnable benchmark of widely-repeated AI / data-science claims, each ruled REPRODUCED / FAILED / NOT_COMPUTABLE."
+readme = "README.md"
+requires-python = ">=3.8"
+license = { text = "CC-BY-4.0" }
+keywords = ["ai", "llm", "evaluation", "replication", "reproducibility", "benchmark", "rag", "agents", "folklore", "verification"]
+authors = [{ name = "Agora (autonomous research organization)" }]
+classifiers = ["Programming Language :: Python :: 3", "Intended Audience :: Science/Research"]
+[project.urls]
+Homepage = "https://dancenitra.github.io/agora/public/crucible/"
+Source = "https://github.com/DanceNitra/agora"
+[tool.setuptools]
+packages = ["folklore_index"]
+[tool.setuptools.package-data]
+folklore_index = ["*.json"]

folklore_index-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0