paperguard 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paperguard-2.0.1/.env.example +13 -0
- paperguard-2.0.1/CHANGELOG.md +350 -0
- paperguard-2.0.1/CITATION.cff +52 -0
- paperguard-2.0.1/CONTRIBUTING.md +69 -0
- paperguard-2.0.1/LICENSE +21 -0
- paperguard-2.0.1/MANIFEST.in +20 -0
- paperguard-2.0.1/PKG-INFO +415 -0
- paperguard-2.0.1/README.md +346 -0
- paperguard-2.0.1/README.zh.md +159 -0
- paperguard-2.0.1/ROADMAP.md +97 -0
- paperguard-2.0.1/SECURITY.md +25 -0
- paperguard-2.0.1/docs/detectors/A1.md +75 -0
- paperguard-2.0.1/docs/detectors/A2.md +75 -0
- paperguard-2.0.1/docs/detectors/A3.md +75 -0
- paperguard-2.0.1/docs/detectors/A5.md +75 -0
- paperguard-2.0.1/docs/detectors/A6.md +75 -0
- paperguard-2.0.1/docs/detectors/A7.md +75 -0
- paperguard-2.0.1/docs/detectors/B1.md +75 -0
- paperguard-2.0.1/docs/detectors/B4.md +75 -0
- paperguard-2.0.1/docs/detectors/B5.md +75 -0
- paperguard-2.0.1/docs/detectors/B6.md +75 -0
- paperguard-2.0.1/docs/detectors/B7.md +75 -0
- paperguard-2.0.1/docs/detectors/B8.md +75 -0
- paperguard-2.0.1/docs/detectors/C1.md +75 -0
- paperguard-2.0.1/docs/detectors/D1.md +75 -0
- paperguard-2.0.1/docs/detectors/D2.md +75 -0
- paperguard-2.0.1/docs/detectors/F1.md +75 -0
- paperguard-2.0.1/docs/detectors/F2.md +75 -0
- paperguard-2.0.1/docs/detectors/F3.md +75 -0
- paperguard-2.0.1/docs/detectors/F4.md +75 -0
- paperguard-2.0.1/docs/detectors/F5.md +75 -0
- paperguard-2.0.1/docs/detectors/G1.md +75 -0
- paperguard-2.0.1/docs/detectors/G3.md +75 -0
- paperguard-2.0.1/docs/detectors/G4.md +75 -0
- paperguard-2.0.1/docs/detectors/README.md +64 -0
- paperguard-2.0.1/docs/detectors/T1.md +75 -0
- paperguard-2.0.1/docs/detectors/T2.md +75 -0
- paperguard-2.0.1/docs/detectors/T3.md +75 -0
- paperguard-2.0.1/docs/detectors/T4.md +75 -0
- paperguard-2.0.1/docs/detectors/T5.md +75 -0
- paperguard-2.0.1/docs/detectors/T6.md +75 -0
- paperguard-2.0.1/docs/detectors.md +242 -0
- paperguard-2.0.1/docs/epistemic_position.md +106 -0
- paperguard-2.0.1/docs/fraud_case_studies.md +219 -0
- paperguard-2.0.1/docs/webui_multitenant.md +215 -0
- paperguard-2.0.1/pyproject.toml +106 -0
- paperguard-2.0.1/setup.cfg +4 -0
- paperguard-2.0.1/src/paperguard/__init__.py +3 -0
- paperguard-2.0.1/src/paperguard/__main__.py +5 -0
- paperguard-2.0.1/src/paperguard/cli.py +994 -0
- paperguard-2.0.1/src/paperguard/config.py +36 -0
- paperguard-2.0.1/src/paperguard/core/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/core/audit.py +45 -0
- paperguard-2.0.1/src/paperguard/core/base_detector.py +69 -0
- paperguard-2.0.1/src/paperguard/core/registry.py +164 -0
- paperguard-2.0.1/src/paperguard/core/types.py +107 -0
- paperguard-2.0.1/src/paperguard/detectors/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/detectors/a1_terminal_digit.py +126 -0
- paperguard-2.0.1/src/paperguard/detectors/a2_benford.py +175 -0
- paperguard-2.0.1/src/paperguard/detectors/a3_arithmetic.py +167 -0
- paperguard-2.0.1/src/paperguard/detectors/a5_decimal_consistency.py +126 -0
- paperguard-2.0.1/src/paperguard/detectors/a6_implausible_values.py +182 -0
- paperguard-2.0.1/src/paperguard/detectors/a7_last_digit_five_zero.py +133 -0
- paperguard-2.0.1/src/paperguard/detectors/b1_grim.py +128 -0
- paperguard-2.0.1/src/paperguard/detectors/b4_statcheck.py +389 -0
- paperguard-2.0.1/src/paperguard/detectors/b5_tiva.py +122 -0
- paperguard-2.0.1/src/paperguard/detectors/b6_grimmer.py +181 -0
- paperguard-2.0.1/src/paperguard/detectors/b7_pcurve.py +148 -0
- paperguard-2.0.1/src/paperguard/detectors/b8_sprite.py +180 -0
- paperguard-2.0.1/src/paperguard/detectors/c1_carlisle.py +220 -0
- paperguard-2.0.1/src/paperguard/detectors/d1_residual_smoothness.py +159 -0
- paperguard-2.0.1/src/paperguard/detectors/d2_missing_pattern.py +126 -0
- paperguard-2.0.1/src/paperguard/detectors/f1_image_duplication.py +125 -0
- paperguard-2.0.1/src/paperguard/detectors/f2_internal_duplication.py +186 -0
- paperguard-2.0.1/src/paperguard/detectors/f3_splice_forensics.py +194 -0
- paperguard-2.0.1/src/paperguard/detectors/f4_cross_paper_image.py +201 -0
- paperguard-2.0.1/src/paperguard/detectors/f5_exif_clustering.py +183 -0
- paperguard-2.0.1/src/paperguard/detectors/g1_exif_temporal.py +271 -0
- paperguard-2.0.1/src/paperguard/detectors/g3_rsid_forensics.py +140 -0
- paperguard-2.0.1/src/paperguard/detectors/g4_metadata_forensics.py +344 -0
- paperguard-2.0.1/src/paperguard/detectors/m1_paper_mill_graph.py +295 -0
- paperguard-2.0.1/src/paperguard/detectors/t1_text_similarity.py +137 -0
- paperguard-2.0.1/src/paperguard/detectors/t2_trial_consistency.py +123 -0
- paperguard-2.0.1/src/paperguard/detectors/t3_data_availability.py +274 -0
- paperguard-2.0.1/src/paperguard/detectors/t4_tortured_phrases.py +291 -0
- paperguard-2.0.1/src/paperguard/detectors/t5_stylometry.py +158 -0
- paperguard-2.0.1/src/paperguard/detectors/t6_ai_text_heuristic.py +213 -0
- paperguard-2.0.1/src/paperguard/evidence/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/evidence/combiner.py +97 -0
- paperguard-2.0.1/src/paperguard/extractor/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/extractor/baseline_tables.py +258 -0
- paperguard-2.0.1/src/paperguard/extractor/docx_tables.py +89 -0
- paperguard-2.0.1/src/paperguard/extractor/excel.py +26 -0
- paperguard-2.0.1/src/paperguard/extractor/images.py +88 -0
- paperguard-2.0.1/src/paperguard/extractor/inline_numbers.py +105 -0
- paperguard-2.0.1/src/paperguard/extractor/metadata.py +24 -0
- paperguard-2.0.1/src/paperguard/extractor/pdf_text.py +65 -0
- paperguard-2.0.1/src/paperguard/extractor/trial_ids.py +34 -0
- paperguard-2.0.1/src/paperguard/fetcher/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/fetcher/cache.py +94 -0
- paperguard-2.0.1/src/paperguard/fetcher/citation_graph.py +150 -0
- paperguard-2.0.1/src/paperguard/fetcher/clinicaltrials.py +60 -0
- paperguard-2.0.1/src/paperguard/fetcher/crossref.py +65 -0
- paperguard-2.0.1/src/paperguard/fetcher/openalex.py +129 -0
- paperguard-2.0.1/src/paperguard/fetcher/ori_sanctions.py +77 -0
- paperguard-2.0.1/src/paperguard/fetcher/pubmed.py +79 -0
- paperguard-2.0.1/src/paperguard/fetcher/pubpeer.py +88 -0
- paperguard-2.0.1/src/paperguard/fetcher/retraction_watch.py +55 -0
- paperguard-2.0.1/src/paperguard/fetcher/unpaywall.py +58 -0
- paperguard-2.0.1/src/paperguard/i18n.py +170 -0
- paperguard-2.0.1/src/paperguard/llm/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/llm/explainer.py +182 -0
- paperguard-2.0.1/src/paperguard/reporter/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/reporter/html_export.py +209 -0
- paperguard-2.0.1/src/paperguard/reporter/json_export.py +17 -0
- paperguard-2.0.1/src/paperguard/reporter/schema.py +31 -0
- paperguard-2.0.1/src/paperguard/reporter/terminal.py +85 -0
- paperguard-2.0.1/src/paperguard/utils/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/utils/float_utils.py +43 -0
- paperguard-2.0.1/src/paperguard/utils/hash.py +19 -0
- paperguard-2.0.1/src/paperguard/webui/__init__.py +0 -0
- paperguard-2.0.1/src/paperguard/webui/admin_bootstrap.py +54 -0
- paperguard-2.0.1/src/paperguard/webui/app.py +244 -0
- paperguard-2.0.1/src/paperguard/webui/db.py +89 -0
- paperguard-2.0.1/src/paperguard/webui/deps.py +67 -0
- paperguard-2.0.1/src/paperguard/webui/models.py +175 -0
- paperguard-2.0.1/src/paperguard/webui/routes_app.py +291 -0
- paperguard-2.0.1/src/paperguard/webui/routes_auth.py +138 -0
- paperguard-2.0.1/src/paperguard/webui/security.py +120 -0
- paperguard-2.0.1/src/paperguard/webui/templates.py +360 -0
- paperguard-2.0.1/src/paperguard.egg-info/PKG-INFO +415 -0
- paperguard-2.0.1/src/paperguard.egg-info/SOURCES.txt +152 -0
- paperguard-2.0.1/src/paperguard.egg-info/dependency_links.txt +1 -0
- paperguard-2.0.1/src/paperguard.egg-info/entry_points.txt +2 -0
- paperguard-2.0.1/src/paperguard.egg-info/requires.txt +50 -0
- paperguard-2.0.1/src/paperguard.egg-info/top_level.txt +1 -0
- paperguard-2.0.1/tests/fixtures/fabricated_geng_style.csv +71 -0
- paperguard-2.0.1/tests/fixtures/generate_fabricated.py +64 -0
- paperguard-2.0.1/tests/fixtures/generate_genuine.py +53 -0
- paperguard-2.0.1/tests/fixtures/genuine_random.csv +71 -0
- paperguard-2.0.1/tests/test_carlisle_auto.py +135 -0
- paperguard-2.0.1/tests/test_cli_extras.py +71 -0
- paperguard-2.0.1/tests/test_coverage_boosters.py +172 -0
- paperguard-2.0.1/tests/test_docx_features.py +108 -0
- paperguard-2.0.1/tests/test_e2e.py +50 -0
- paperguard-2.0.1/tests/test_evidence_combiner.py +43 -0
- paperguard-2.0.1/tests/test_golden.py +68 -0
- paperguard-2.0.1/tests/test_html_and_batch.py +81 -0
- paperguard-2.0.1/tests/test_i18n.py +73 -0
- paperguard-2.0.1/tests/test_i18n_extra.py +34 -0
- paperguard-2.0.1/tests/test_llm_explainer.py +46 -0
- paperguard-2.0.1/tests/test_plugin_registry.py +81 -0
- paperguard-2.0.1/tests/test_webui.py +82 -0
- paperguard-2.0.1/tests/test_webui_multitenant.py +476 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Email for "polite pool" of free APIs (OpenAlex, CrossRef, Unpaywall)
|
|
2
|
+
# Replace with your own email - this is REQUIRED for higher rate limits
|
|
3
|
+
PAPERGUARD_EMAIL=your.email@example.com
|
|
4
|
+
|
|
5
|
+
# Optional: NCBI API key for higher PubMed rate limits
|
|
6
|
+
# Get one for free at https://www.ncbi.nlm.nih.gov/account/
|
|
7
|
+
NCBI_API_KEY=
|
|
8
|
+
|
|
9
|
+
# Cache directory
|
|
10
|
+
PAPERGUARD_CACHE_DIR=~/.paperguard/cache
|
|
11
|
+
|
|
12
|
+
# Default RNG seed for reproducibility
|
|
13
|
+
PAPERGUARD_SEED=42
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to PaperGuard are documented in this file. Format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/) and the project adheres to
|
|
5
|
+
[Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [2.0.1] — 2026-05-19 — macOS arm64 CI fix
|
|
8
|
+
|
|
9
|
+
### Fixed
|
|
10
|
+
- **`greenlet>=3.0`** added explicitly to the `[webui]` and `[dev]`
|
|
11
|
+
extras. SQLAlchemy 2.x needs `greenlet` for its async-to-sync
|
|
12
|
+
bridge; it is normally a transitive dep on Linux and Windows, but
|
|
13
|
+
the macOS arm64 wheel marks it optional in some versions, which
|
|
14
|
+
caused all 20 multi-tenant tests to error on the macos-latest
|
|
15
|
+
CI matrix. Local `pip install -e .[dev]` on macOS now pulls it in.
|
|
16
|
+
- No source-code changes; only the dependency declarations changed.
|
|
17
|
+
|
|
18
|
+
## [2.0.0] — 2026-05-19 — Paper-mill graph + Carlisle automation + multi-tenant Web UI
|
|
19
|
+
|
|
20
|
+
Second stable major release. Folds in the two items shipped in 2.0.0.dev1 (M1 paper-mill
|
|
21
|
+
citation-graph signatures; deeper Carlisle automation including multi-arm
|
|
22
|
+
RCT support and PDF→C1 trial-ID auto-extraction) and adds an opt-in
|
|
23
|
+
multi-tenant Web UI surface. 223 tests passing, mypy strict clean,
|
|
24
|
+
30 built-in detectors plus plugin entry-point.
|
|
25
|
+
|
|
26
|
+
### Added (multi-tenant Web UI)
|
|
27
|
+
- **`/app/*` invite-only multi-tenant surface** mounted on top of the
|
|
28
|
+
existing anonymous `/scan` endpoints. Activated by setting
|
|
29
|
+
`PAPERGUARD_DB_URL` or `PAPERGUARD_MULTITENANT=1`; otherwise no behaviour
|
|
30
|
+
changes from 1.x.
|
|
31
|
+
- **User + InviteCode + Project + ScanReport ORM** (SQLAlchemy 2.0 async).
|
|
32
|
+
SQLite by default; any async engine (PostgreSQL via `asyncpg`, MySQL via
|
|
33
|
+
`aiomysql`) via `PAPERGUARD_DB_URL`.
|
|
34
|
+
- **Per-report visibility**: `private` / `org` / `public`. Public reports
|
|
35
|
+
are listed at `/app/shared` and readable anonymously.
|
|
36
|
+
- **Admin invite flow**: admins mint single-use codes at
|
|
37
|
+
`/app/admin/invites`; invitees redeem at `/app/redeem/{code}` with a
|
|
38
|
+
password ≥ 10 characters.
|
|
39
|
+
- **Bootstrap admin from env**: `PAPERGUARD_ADMIN_EMAIL` +
|
|
40
|
+
`PAPERGUARD_ADMIN_PASSWORD` create the first admin idempotently on
|
|
41
|
+
startup.
|
|
42
|
+
- **Sessions** in HttpOnly, SameSite=Lax cookies signed with
|
|
43
|
+
`itsdangerous` (key from `PAPERGUARD_SECRET_KEY`); 14-day TTL.
|
|
44
|
+
- **Passwords** hashed with `bcrypt` directly (no passlib; bcrypt 4.x/5.x
|
|
45
|
+
compatible).
|
|
46
|
+
- New optional dependencies in `paperguard[webui]`: `sqlalchemy>=2.0`,
|
|
47
|
+
`aiosqlite>=0.20`, `bcrypt>=4.0`, `itsdangerous>=2.2`.
|
|
48
|
+
- **24 new tests** covering bootstrap, auth, invite redemption (single-use,
|
|
49
|
+
weak-password rejection, email binding), project isolation, visibility
|
|
50
|
+
enforcement (private / org / public), legacy `/scan` survival,
|
|
51
|
+
multi-tenant off-by-default, and session signing.
|
|
52
|
+
- New documentation: `docs/webui_multitenant.md` with architecture,
|
|
53
|
+
env-var reference, invite flow, visibility semantics, and production
|
|
54
|
+
checklist.
|
|
55
|
+
|
|
56
|
+
### Added (Carlisle automation deepening)
|
|
57
|
+
- **C1 multi-arm RCT support**: `BaselineVariable.arms` accepts any number
|
|
58
|
+
of `(n, mean, sd)` arms. Pairwise Welch t between every arm pair;
|
|
59
|
+
per-variable median pairwise p folded into the Stouffer combination.
|
|
60
|
+
Backward-compatible (legacy `n1/mean1/sd1 + n2/mean2/sd2` still works).
|
|
61
|
+
- **Robust baseline-table parser**: handles 3+ arm tables, header-embedded
|
|
62
|
+
N (`Treatment (n=42)`), categorical `n (%)` rows recorded separately,
|
|
63
|
+
multiple table caption formats (`baseline`, `demographics`, `study
|
|
64
|
+
population`, `participant characteristics`, `table 1/i`).
|
|
65
|
+
- **Auto-extract trial registration IDs** from PDF/docx text
|
|
66
|
+
(`extractor/trial_ids.py`): supports NCT, ISRCTN, ChiCTR, ACTRN,
|
|
67
|
+
EudraCT, DRKS. CLI surfaces found IDs in scan output.
|
|
68
|
+
- 14 new tests for multi-arm Welch, trial-ID extraction, arm-column
|
|
69
|
+
identification, mean±SD / paren / categorical row parsing.
|
|
70
|
+
|
|
71
|
+
### Added (M1 paper-mill citation graph)
|
|
72
|
+
- **M1 — Paper-Mill Citation Graph Signatures** detector
|
|
73
|
+
(`detectors/m1_paper_mill_graph.py`).
|
|
74
|
+
- **Citation-graph fetcher** (`fetcher/citation_graph.py`).
|
|
75
|
+
- **`--check-paper-mill` flag** on `paperguard scan`.
|
|
76
|
+
- 5 unit tests against synthetic citation subgraphs.
|
|
77
|
+
- New core dependency: `networkx>=3.0`.
|
|
78
|
+
|
|
79
|
+
### Planned for 2.x (still open)
|
|
80
|
+
- Full Cabanac 2025 PDCN model on a 5M-node citation graph (the M1
|
|
81
|
+
detector is the local-subgraph version of the same signatures)
|
|
82
|
+
- Reviewer-fraud signal extraction (no public data source yet)
|
|
83
|
+
- ML-trained Western-blot specific image model (requires labeled corpus)
|
|
84
|
+
- Multi-tenant Web UI with shared scan history (a first invite-only version shipped in this release; see "Added (multi-tenant Web UI)" above)
|
|
85
|
+
|
|
86
|
+
## [1.0.0] — 2026-05-18 — First stable release
|
|
87
|
+
|
|
88
|
+
### Added (since 0.9.0)
|
|
89
|
+
- **PDF→C1 auto baseline extraction** (`extractor/baseline_tables.py`)
|
|
90
|
+
— CLI now automatically scans PDFs for baseline characteristics tables
|
|
91
|
+
and feeds them to C1 Carlisle.
|
|
92
|
+
- **Expanded T4 dictionary** — tortured phrases went from ~50 to **150+**
|
|
93
|
+
curated entries covering CS/ML, statistics, optimization, biomedical,
|
|
94
|
+
energy, common terms, and GPT-disguise phrases.
|
|
95
|
+
- **`paperguard server`** — production-mode daemon with X-API-Token auth,
|
|
96
|
+
multi-worker, `/health` endpoint.
|
|
97
|
+
- **`docs/detectors/`** — 29 deep-dive markdown pages auto-generated from
|
|
98
|
+
source (one per detector), plus an index README.
|
|
99
|
+
- **`scripts/generate_detector_docs.py`** — regenerate detector docs from
|
|
100
|
+
source metadata.
|
|
101
|
+
- **GitHub Actions enhancements**: codecov upload, dependabot config,
|
|
102
|
+
release-please workflow.
|
|
103
|
+
- **Coverage badge** (73% on tracked source files).
|
|
104
|
+
- **`README.zh.md`** finalized.
|
|
105
|
+
|
|
106
|
+
### Changed
|
|
107
|
+
- Status promoted from Alpha to Beta in PyPI classifiers (the project
|
|
108
|
+
is now stable enough for its first real users).
|
|
109
|
+
|
|
110
|
+
### Decided NOT to do in 1.0
|
|
111
|
+
- **Extras split** (`paperguard[image]` / `paperguard[text]` / etc.) —
|
|
112
|
+
current dependencies are all small enough that the additional
|
|
113
|
+
packaging complexity doesn't pay for itself. May revisit in 2.0
|
|
114
|
+
if dependency tree grows.
|
|
115
|
+
|
|
116
|
+
### Post-1.0 polish
|
|
117
|
+
|
|
118
|
+
- **Fetcher disk-cache** (`src/paperguard/fetcher/cache.py`) — OpenAlex
|
|
119
|
+
/ CrossRef / Unpaywall responses cached to `diskcache` for 7 days,
|
|
120
|
+
reducing API load on repeat scans.
|
|
121
|
+
- **`paperguard list-detectors`** — table / json / ids output, optional
|
|
122
|
+
`--cluster` filter.
|
|
123
|
+
- **`paperguard fetch-rw`** — downloads the Retraction Watch CSV.
|
|
124
|
+
- **`paperguard fetch-ori`** — writes a starter ORI sanctions template.
|
|
125
|
+
- **`docker-compose.yml`** — production reference deployment.
|
|
126
|
+
- **`MANIFEST.in`** — ensures docs and fixtures ship in the sdist.
|
|
127
|
+
- **`examples/plugin_example/`** — fully installable plugin template
|
|
128
|
+
with entry-point wiring.
|
|
129
|
+
- **`paperguard.reporter.schema`** — emits a JSON Schema describing the
|
|
130
|
+
audit-report shape.
|
|
131
|
+
- **Coverage** 73% → **74%** with 16 new boost tests (180 total).
|
|
132
|
+
|
|
133
|
+
## [0.9.0] — 2026-05-18 — Polish + 4 specialized detectors
|
|
134
|
+
|
|
135
|
+
### Added detectors (4 new, 25 → 29)
|
|
136
|
+
- **A7 Last-Digit 0/5 Preference (Geng method)** — Binomial test on
|
|
137
|
+
P(last digit ∈ {0,5}) ≠ 0.2; specialized refinement of A1 χ², bidirectional
|
|
138
|
+
(catches both excess and depression). Direct internalization of the
|
|
139
|
+
2025 Geng Hongwei method.
|
|
140
|
+
- **B8 SPRITE plausibility** — Heathers et al. (2018) iterative
|
|
141
|
+
reconstruction: given (mean, SD, N, scale_min, scale_max), tries to
|
|
142
|
+
construct any valid integer sample; failure → SUSPICIOUS.
|
|
143
|
+
- **F5 EXIF Cross-Image Clustering** — multi-image consistency: span
|
|
144
|
+
> 5 years, > 2 distinct camera models, or identical second-precision
|
|
145
|
+
timestamps across ≥ 3 images.
|
|
146
|
+
- **T6 AI-Generated Text Heuristic** — two-layer detection:
|
|
147
|
+
uncleaned LLM response leakage (CRITICAL) + AI-overused phrase
|
|
148
|
+
density (Kobak 2025 word patterns).
|
|
149
|
+
|
|
150
|
+
### Added fetchers
|
|
151
|
+
- **`fetcher/openalex.py`** — new `get_author_retraction_rate(author_id)`
|
|
152
|
+
using OpenAlex `is_retracted` flag (Retraction Watch–synced upstream).
|
|
153
|
+
- **`fetcher/pubmed.py`** — Biopython `Entrez` wrapper for PMID lookup
|
|
154
|
+
and DOI→PMID resolution. Saves us re-implementing E-utils.
|
|
155
|
+
|
|
156
|
+
### Added documentation
|
|
157
|
+
- **`README.zh.md`** — full Chinese (中文) translation of the README.
|
|
158
|
+
- `docs/fraud_case_studies.md` (from 0.8.0) — cross-referenced.
|
|
159
|
+
|
|
160
|
+
### Tests
|
|
161
|
+
- **`tests/test_golden.py`** — anti-regression gate: golden findings
|
|
162
|
+
count on the paired fixtures.
|
|
163
|
+
- 12 new tests for A7/B8/F5/T6.
|
|
164
|
+
- 164 total passing.
|
|
165
|
+
|
|
166
|
+
### Reused open-source components
|
|
167
|
+
- **biopython** for NCBI Entrez (vs hand-rolled E-utils HTTP).
|
|
168
|
+
- imagehash, pdfplumber, pymupdf, opencv-python-headless, PIL, piexif:
|
|
169
|
+
all already in use. No new heavyweight dependencies in 0.9.0.
|
|
170
|
+
|
|
171
|
+
## [0.8.0] — 2026-05-18 — Real-world fraud case internalization
|
|
172
|
+
|
|
173
|
+
### Added (6 new detectors derived from real-case study)
|
|
174
|
+
- **A6 Implausible Values** — column-name-aware range checks + sentinel
|
|
175
|
+
values (999 / -999) detection. Internalizes Wansink's "700 pizza slices".
|
|
176
|
+
- **B7 P-Curve** (Simonsohn 2014) — p-curve shape analysis; left-skew or
|
|
177
|
+
near-α pile-up signals p-hacking. Internalizes Wansink's email-leaked
|
|
178
|
+
data-mining patterns.
|
|
179
|
+
- **D1 Residual Smoothness** — block-variance stability check;
|
|
180
|
+
internalizes the Stapel-case signature of "too clean" data.
|
|
181
|
+
- **D2 Missing-Data Pattern** — flags 0-missing datasets with low
|
|
182
|
+
column-σ variation; internalizes Carlisle's RCT-fraud observation.
|
|
183
|
+
- **F4 Cross-Paper Image Duplication** — persistent SQLite pHash store
|
|
184
|
+
for cross-paper image reuse. Internalizes Masliah (2024) and Hwang
|
|
185
|
+
(2005) findings.
|
|
186
|
+
- **T5 Stylometry** — Markowitz-Hancock 2014 PLOS ONE linguistic
|
|
187
|
+
fingerprint (methodology / certainty / adjective density ratios).
|
|
188
|
+
Internalizes the Stapel-text findings.
|
|
189
|
+
|
|
190
|
+
### Documentation
|
|
191
|
+
- **`docs/fraud_case_studies.md`** — Stapel, Fujii, Hwang, Schön,
|
|
192
|
+
Macchiarini, Wansink, Masliah, Geng-targets, Bik 2016: each case maps
|
|
193
|
+
to specific detectors with honest "would catch" vs "cannot catch"
|
|
194
|
+
assessment.
|
|
195
|
+
|
|
196
|
+
### Detector count
|
|
197
|
+
- Built-in: 25 (was 19). Added A6, B7, D1, D2, F4, T5.
|
|
198
|
+
|
|
199
|
+
### Tests
|
|
200
|
+
- 152 passing (was 138). Added 13 new tests for the 6 new detectors.
|
|
201
|
+
|
|
202
|
+
## [0.7.0] — 2026-05-18
|
|
203
|
+
|
|
204
|
+
### Added
|
|
205
|
+
- **B5 TIVA** (Schimmack 2014) — z-score variance test on a set of independent
|
|
206
|
+
study p-values; insufficient variance → potential p-hacking / selective
|
|
207
|
+
reporting.
|
|
208
|
+
- **B6 GRIMMER** (Anaya 2016; Allard 2018) — `(mean, SD, N)` triple-consistency
|
|
209
|
+
test. Stricter than B1 GRIM (which only checks mean × N).
|
|
210
|
+
- **T4 Tortured Phrases** (Cabanac 2021) — 50+ machine-translation
|
|
211
|
+
fingerprint phrases ("profound neural organization" → "deep neural network").
|
|
212
|
+
Detects paper-mill / synonym-laundered text.
|
|
213
|
+
- **B4 statcheck upgrades** — added **Q-test** for meta-analysis heterogeneity;
|
|
214
|
+
whole-text one-tailed scan (if "one-tailed/one-sided/单尾" anywhere in the
|
|
215
|
+
manuscript, switch matching t/r/z to one-tailed when that gives consistency).
|
|
216
|
+
- **`paperguard selfcheck`** — runs internal fixtures through all detectors as
|
|
217
|
+
a sanity check on installation.
|
|
218
|
+
- **`paperguard explain`** — LLM-explanation of a specific finding from a JSON
|
|
219
|
+
report (needs `PAPERGUARD_LLM_PROVIDER`).
|
|
220
|
+
- **`paperguard diff before.json after.json`** — track changes between two
|
|
221
|
+
scan reports.
|
|
222
|
+
- **Auto-OA-PDF download** — `paperguard scan --doi X` (without `-f`) now
|
|
223
|
+
attempts to download the OA PDF via Unpaywall.
|
|
224
|
+
- **docs/** — `detectors.md` (per-detector reference) and
|
|
225
|
+
`epistemic_position.md` (vocabulary rule, innocent-explanation rule).
|
|
226
|
+
- **examples/04_full_pipeline_demo.py** — exercise every detector class.
|
|
227
|
+
|
|
228
|
+
### Detector count
|
|
229
|
+
- Built-in: 19 (was 16). Added B5, B6, T4.
|
|
230
|
+
|
|
231
|
+
### Tests
|
|
232
|
+
- 138 passing (was 120). Added: T4 ×5, B5 ×4, B6 ×4, CLI extras ×5.
|
|
233
|
+
|
|
234
|
+
## [0.6.0] — 2026-05-18
|
|
235
|
+
|
|
236
|
+
### Added
|
|
237
|
+
- **T3 — Data Availability + Ethics Audit** detector. Flags missing data
|
|
238
|
+
statements, vague "available on request" without verifiable accessions,
|
|
239
|
+
missing IRB/IACUC, missing trial registration (NCT/ISRCTN/ChiCTR/EudraCT),
|
|
240
|
+
missing competing-interests disclosure.
|
|
241
|
+
- **F3 — Splice / Copy-Move Forensics** detector. Patch-level statistical
|
|
242
|
+
signatures (mean / std / Laplacian variance) with translation-vote
|
|
243
|
+
consistency to find pixel-level cloning that ORB-based F2 misses.
|
|
244
|
+
- **CLI auto-runs T3** on extracted PDF/docx text.
|
|
245
|
+
|
|
246
|
+
### Fixed
|
|
247
|
+
- **PDF image extraction** now filters out tiny embedded bitmaps (math
|
|
248
|
+
symbols, font glyphs) by size (≥ 200×200 px, ≥ 8 KB) and SHA-256-dedups,
|
|
249
|
+
eliminating massive F1 false-positive cascades on typeset PDFs.
|
|
250
|
+
- **G4 publisher-creator whitelist** — Springer / Elsevier / Wiley / LaTeX /
|
|
251
|
+
pdfTeX / Acrobat Distiller / Word / LibreOffice etc. no longer trigger
|
|
252
|
+
the "creator not in authors list" CONCERN. This was a 100% false positive
|
|
253
|
+
on every published PDF.
|
|
254
|
+
|
|
255
|
+
### Tested on
|
|
256
|
+
- 2 real Nature Communications papers (ecology, OA) — both correctly
|
|
257
|
+
classified as PASS with 0 findings.
|
|
258
|
+
|
|
259
|
+
## [0.5.0] — 2026-05-18
|
|
260
|
+
|
|
261
|
+
### Added
|
|
262
|
+
- **F2 — Bik-style internal image duplication** detector. ORB keypoint
|
|
263
|
+
self-matching + RANSAC affine consensus to find copy-pasted patches inside
|
|
264
|
+
a single image. Rotation/scale tolerant.
|
|
265
|
+
- **T1 — Text similarity** detector. 5-gram word-shingling + Jaccard against
|
|
266
|
+
a user-supplied corpus (no network). For self-plagiarism and re-use.
|
|
267
|
+
- **T2 — Clinical-trial outcome consistency** detector. Compares paper's
|
|
268
|
+
reported primary outcomes to ClinicalTrials.gov v2 API registration.
|
|
269
|
+
Catches outcome switching (Goldacre 2019).
|
|
270
|
+
- **ORI sanctions** local CSV lookup (`paperguard.fetcher.ori_sanctions`).
|
|
271
|
+
- **LLM explainer** (opt-in via `PAPERGUARD_LLM_PROVIDER`) — supports
|
|
272
|
+
OpenAI / Anthropic / Ollama. Hard-coded system prompt forbids the LLM
|
|
273
|
+
from claiming fraud or inventing evidence.
|
|
274
|
+
- **Statcheck one-tailed support** — recognizes "one-tailed / 单尾" in the
|
|
275
|
+
reporting context.
|
|
276
|
+
- **i18n: es, ja, de** language packs (now 5 total).
|
|
277
|
+
- **WCAG 2.1 AA** for HTML reports: focus-visible outlines, ARIA roles,
|
|
278
|
+
semantic header/main/footer, `prefers-reduced-motion`, severity colors
|
|
279
|
+
re-tuned for ≥ 4.5:1 contrast on white.
|
|
280
|
+
- New dependency: `opencv-python-headless` (for F2).
|
|
281
|
+
- 21 new tests (133 total).
|
|
282
|
+
|
|
283
|
+
## [0.4.0] — 2026-05-18
|
|
284
|
+
|
|
285
|
+
### Added
|
|
286
|
+
- **Plugin system** — third-party packages can register detectors via
|
|
287
|
+
`paperguard.detectors` entry-point group. `DetectorRegistry.load_plugins()`
|
|
288
|
+
discovers and instantiates them with safe error handling.
|
|
289
|
+
- **i18n** — report framework now supports `en` and `zh-CN` via a lightweight
|
|
290
|
+
dict-backed `t()` helper (no gettext / .po toolchain). `--lang` flag added
|
|
291
|
+
to `scan`. `PAPERGUARD_LANG` env var also honored.
|
|
292
|
+
- **Web UI** (`paperguard webui`) — FastAPI app with upload form, language
|
|
293
|
+
selector, `/detectors` introspection endpoint, and `/scan.json` for
|
|
294
|
+
programmatic use. Available via the `paperguard[webui]` extras.
|
|
295
|
+
- 14 new tests (91 total): i18n, plugin loader (mocked entry points), and
|
|
296
|
+
webui (TestClient).
|
|
297
|
+
|
|
298
|
+
### Changed
|
|
299
|
+
- `paperguard` package now exposes `__version__`.
|
|
300
|
+
- `register_default(load_plugins=True)` is the new default; pass `False`
|
|
301
|
+
to opt-out (used in tests).
|
|
302
|
+
|
|
303
|
+
## [0.3.0] — 2026-05-18
|
|
304
|
+
|
|
305
|
+
### Added
|
|
306
|
+
- **C1 Carlisle** baseline-imbalance detector for RCTs (Welch t per variable + Stouffer combination).
|
|
307
|
+
- **F1 image-duplication** detector via perceptual hash (`imagehash` library).
|
|
308
|
+
- **G1 image EXIF temporal forensics** — flags shooting time before claimed experiment start, after submission, and Photoshop signatures.
|
|
309
|
+
- **G3 docx rsid forensics** — identifies python-docx / pandoc-generated files via missing or homogeneous `w:rsid` values.
|
|
310
|
+
- **Image extractors** for .docx (word/media/) and .pdf (via pymupdf).
|
|
311
|
+
- **Retraction Watch CSV loader** — local lookup against the official dataset (no network).
|
|
312
|
+
- Tests: 19 new (67 total).
|
|
313
|
+
|
|
314
|
+
## [0.2.0] — 2026-05-18
|
|
315
|
+
|
|
316
|
+
### Added
|
|
317
|
+
- **A2 Benford** first-digit detector with applicability gate (≥ 2 decades of range).
|
|
318
|
+
- **B4 statcheck** — recompute reported `t / F / χ² / r / z` p-values from
|
|
319
|
+
manuscript text, flagging decision-reversals as SUSPICIOUS and numeric
|
|
320
|
+
inconsistencies as CONCERN.
|
|
321
|
+
- **PubPeer** client — surfaces existing public comments on a DOI.
|
|
322
|
+
- **PDF text and table extraction** via pymupdf + pdfplumber.
|
|
323
|
+
- **Docx inline-number classification** — extracts and classifies numbers from
|
|
324
|
+
prose (p-values, percentages, mean ± SD, generic decimals).
|
|
325
|
+
- **HTML report export** (`--output-html`) — self-contained styled HTML.
|
|
326
|
+
- **Batch mode** (`paperguard batch --glob 'papers/*.pdf'`) for many files at once.
|
|
327
|
+
- **Dockerfile** for containerized usage.
|
|
328
|
+
- Roadmap, contributing guide, security policy, GitHub Actions CI.
|
|
329
|
+
|
|
330
|
+
### Changed
|
|
331
|
+
- `clean-meta` subcommand removed from PaperGuard; moved to a separate
|
|
332
|
+
standalone tool to keep this project narrowly scoped to detection.
|
|
333
|
+
- `scan` now auto-handles .xlsx, .csv, .tsv, .docx, .pdf — both table data and
|
|
334
|
+
free-text run through detectors as appropriate.
|
|
335
|
+
|
|
336
|
+
### Removed
|
|
337
|
+
- `src/paperguard/utils/docx_meta_writer.py` and the `clean-meta` CLI command.
|
|
338
|
+
Cleanup tooling lives in a separate private repo to avoid coupling
|
|
339
|
+
detection with anti-detection in one shipped product.
|
|
340
|
+
|
|
341
|
+
## [0.1.0] — 2026-05-18
|
|
342
|
+
|
|
343
|
+
### Added
|
|
344
|
+
- Initial MVP release with five detectors: A1 (terminal digit), A3 (inter-column
|
|
345
|
+
arithmetic), A5 (decimal consistency), B1 (GRIM), G4 (file metadata forensics).
|
|
346
|
+
- Click-based CLI with `scan` and `search` subcommands.
|
|
347
|
+
- Rich terminal report + JSON export + immutable audit log.
|
|
348
|
+
- OpenAlex, CrossRef, and Unpaywall clients (`scan --doi` integration).
|
|
349
|
+
- BH–FDR p-value correction and severity escalation (PASS → CRITICAL).
|
|
350
|
+
- 22 tests, full `mypy --strict` + `ruff` clean.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use PaperGuard in your research, please cite it."
|
|
3
|
+
title: PaperGuard
|
|
4
|
+
abstract: >
|
|
5
|
+
Statistical anomaly screener for tabular research data. Flags anomalies,
|
|
6
|
+
not fraud. Detectors cover terminal-digit, Benford, inter-column
|
|
7
|
+
arithmetic, decimal consistency, GRIM, statcheck, Carlisle baseline
|
|
8
|
+
balance, image duplication (pHash), EXIF temporal forensics, docx rsid
|
|
9
|
+
forensics, and file metadata forensics.
|
|
10
|
+
authors:
|
|
11
|
+
- name: PaperGuard Contributors
|
|
12
|
+
version: 2.0.1
|
|
13
|
+
date-released: 2026-05-19
|
|
14
|
+
license: MIT
|
|
15
|
+
keywords:
|
|
16
|
+
- research-integrity
|
|
17
|
+
- scientific-misconduct
|
|
18
|
+
- statistical-forensics
|
|
19
|
+
- data-fabrication
|
|
20
|
+
- peer-review
|
|
21
|
+
repository-code: https://example.com/paperguard
|
|
22
|
+
references:
|
|
23
|
+
- type: article
|
|
24
|
+
authors:
|
|
25
|
+
- family-names: Mosimann
|
|
26
|
+
given-names: J. E.
|
|
27
|
+
title: "Data fabrication: Can people generate random digits?"
|
|
28
|
+
journal: Accountability in Research
|
|
29
|
+
year: 1995
|
|
30
|
+
- type: article
|
|
31
|
+
authors:
|
|
32
|
+
- family-names: Brown
|
|
33
|
+
given-names: N. J. L.
|
|
34
|
+
- family-names: Heathers
|
|
35
|
+
given-names: J. A. J.
|
|
36
|
+
title: "The GRIM Test: A simple technique detects numerous anomalies in the reporting of results in psychology"
|
|
37
|
+
journal: Social Psychological and Personality Science
|
|
38
|
+
year: 2017
|
|
39
|
+
- type: article
|
|
40
|
+
authors:
|
|
41
|
+
- family-names: Nuijten
|
|
42
|
+
given-names: M. B.
|
|
43
|
+
title: "The prevalence of statistical reporting errors in psychology (1985-2013)"
|
|
44
|
+
journal: Behavior Research Methods
|
|
45
|
+
year: 2016
|
|
46
|
+
- type: article
|
|
47
|
+
authors:
|
|
48
|
+
- family-names: Carlisle
|
|
49
|
+
given-names: J. B.
|
|
50
|
+
title: "Data fabrication and other reasons for non-random sampling in 5087 randomised, controlled trials in anaesthetic and general medical journals"
|
|
51
|
+
journal: Anaesthesia
|
|
52
|
+
year: 2017
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Contributing to PaperGuard
|
|
2
|
+
|
|
3
|
+
Thanks for considering a contribution. The most useful contributions are new
|
|
4
|
+
detectors. The codebase makes adding one straightforward.
|
|
5
|
+
|
|
6
|
+
## Setup
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
git clone <repo>
|
|
10
|
+
cd PaperGuard
|
|
11
|
+
python -m venv .venv
|
|
12
|
+
# Linux/macOS: source .venv/bin/activate
|
|
13
|
+
# Windows: .\.venv\Scripts\Activate.ps1
|
|
14
|
+
pip install -e ".[dev]" types-openpyxl
|
|
15
|
+
pre-commit install # optional
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Run the validation suite before any PR:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pytest -m "not network" -v
|
|
22
|
+
ruff check src/ tests/
|
|
23
|
+
mypy src/
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Adding a new detector
|
|
27
|
+
|
|
28
|
+
Use `src/paperguard/detectors/a1_terminal_digit.py` as the canonical template.
|
|
29
|
+
Every detector must:
|
|
30
|
+
|
|
31
|
+
1. Subclass `BaseDetector`.
|
|
32
|
+
2. Define class-level `id`, `name`, `description`, `academic_basis`,
|
|
33
|
+
`data_requirements`, `assumption_cluster` (all as `ClassVar`).
|
|
34
|
+
3. Implement `check_applicability(data) -> tuple[bool, str]` — return
|
|
35
|
+
`(False, reason)` rather than raising when data doesn't fit.
|
|
36
|
+
4. Implement `_detect(data, seed) -> list[Finding]`.
|
|
37
|
+
5. Register the detector in
|
|
38
|
+
`src/paperguard/core/registry.py:DetectorRegistry.register_default()`.
|
|
39
|
+
|
|
40
|
+
Each `Finding` must include at least three `innocent_explanations`. This is
|
|
41
|
+
non-negotiable — the epistemic posture of the tool depends on it.
|
|
42
|
+
|
|
43
|
+
## Tests
|
|
44
|
+
|
|
45
|
+
For each detector, add a test file under `tests/test_detectors/`. At minimum:
|
|
46
|
+
|
|
47
|
+
- One test that confirms the detector flags `fabricated_data` at
|
|
48
|
+
`CONCERN` or higher.
|
|
49
|
+
- One test that confirms it does not flag `genuine_data` at `SUSPICIOUS` or higher.
|
|
50
|
+
- One test for inapplicability (e.g., wrong data type or too-small N).
|
|
51
|
+
|
|
52
|
+
Tests requiring network calls must be marked with `@pytest.mark.network` so
|
|
53
|
+
CI can skip them.
|
|
54
|
+
|
|
55
|
+
## Code style
|
|
56
|
+
|
|
57
|
+
- Python ≥ 3.11. Use `X | Y` not `Union[X, Y]`; `list[X]` not `List[X]`.
|
|
58
|
+
- Type hint every public function. We run `mypy --strict`.
|
|
59
|
+
- Use pydantic v2 for data validation; `click` for CLI; `rich` for terminal.
|
|
60
|
+
- Avoid hard-coding secrets. New API clients should read credentials via
|
|
61
|
+
`pydantic-settings` in `config.py`.
|
|
62
|
+
|
|
63
|
+
## Epistemic rules (not negotiable)
|
|
64
|
+
|
|
65
|
+
The tool never outputs the words "fraud", "造假", "misconduct",
|
|
66
|
+
or "cheating" in any report. Use "anomaly", "statistical inconsistency",
|
|
67
|
+
or "unexplained pattern" instead. Every report ends with the standard disclaimer.
|
|
68
|
+
|
|
69
|
+
A finding with fewer than three `innocent_explanations` is a bug.
|
paperguard-2.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 PaperGuard Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
include LICENSE
|
|
2
|
+
include README.md
|
|
3
|
+
include README.zh.md
|
|
4
|
+
include CHANGELOG.md
|
|
5
|
+
include CITATION.cff
|
|
6
|
+
include CONTRIBUTING.md
|
|
7
|
+
include SECURITY.md
|
|
8
|
+
include ROADMAP.md
|
|
9
|
+
include pyproject.toml
|
|
10
|
+
include .env.example
|
|
11
|
+
|
|
12
|
+
recursive-include docs *.md
|
|
13
|
+
recursive-include src/paperguard *.py
|
|
14
|
+
recursive-include tests/fixtures *.csv *.py
|
|
15
|
+
|
|
16
|
+
global-exclude __pycache__
|
|
17
|
+
global-exclude *.py[cod]
|
|
18
|
+
global-exclude *.so
|
|
19
|
+
global-exclude .DS_Store
|
|
20
|
+
global-exclude *.bak
|