codedoc-ai 0.8.0__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/CHANGELOG.md +250 -0
  2. {codedoc_ai-0.8.0/codedoc_ai.egg-info → codedoc_ai-0.9.1}/PKG-INFO +98 -11
  3. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/README.md +97 -10
  4. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/RUN_FLOW.md +27 -4
  5. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/__init__.py +1 -1
  6. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/base_agent.py +11 -11
  7. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/dependency_agent.py +36 -36
  8. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/documentation_agent.py +10 -10
  9. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/orchestrator.py +48 -8
  10. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/structure_agent.py +10 -10
  11. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/cli/cli.py +70 -24
  12. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/graph.py +9 -2
  13. codedoc_ai-0.9.1/codedoc/core/loader.py +618 -0
  14. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/output.py +50 -0
  15. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/project_view.py +245 -14
  16. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/safe_writer.py +29 -0
  17. codedoc_ai-0.9.1/codedoc/core/scanner.py +270 -0
  18. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/llm/factory.py +63 -29
  19. codedoc_ai-0.9.1/codedoc/llm/rate_limit_profile.py +192 -0
  20. codedoc_ai-0.9.1/codedoc/parser/generic_parser.py +241 -0
  21. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/pipeline.py +257 -45
  22. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/utils/logger.py +12 -1
  23. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1/codedoc_ai.egg-info}/PKG-INFO +98 -11
  24. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc_ai.egg-info/SOURCES.txt +7 -0
  25. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/pyproject.toml +7 -1
  26. codedoc_ai-0.9.1/tests/conftest.py +30 -0
  27. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_080_features.py +6 -2
  28. codedoc_ai-0.9.1/tests/test_081_configurable_defaults.py +704 -0
  29. codedoc_ai-0.9.1/tests/test_081_lossless_md.py +1051 -0
  30. codedoc_ai-0.9.1/tests/test_081_placeholder.py +500 -0
  31. codedoc_ai-0.9.1/tests/test_081_rate_limit_profiles.py +848 -0
  32. codedoc_ai-0.9.1/tests/test_090_features.py +483 -0
  33. codedoc_ai-0.9.1/tests/test_graph.py +158 -0
  34. codedoc_ai-0.9.1/tests/test_parser.py +310 -0
  35. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_pipeline.py +293 -24
  36. codedoc_ai-0.9.1/tests/test_scanner.py +141 -0
  37. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_scenarios.py +52 -27
  38. codedoc_ai-0.8.0/codedoc/core/loader.py +0 -337
  39. codedoc_ai-0.8.0/codedoc/core/scanner.py +0 -192
  40. codedoc_ai-0.8.0/codedoc/parser/generic_parser.py +0 -88
  41. codedoc_ai-0.8.0/tests/test_graph.py +0 -83
  42. codedoc_ai-0.8.0/tests/test_parser.py +0 -96
  43. codedoc_ai-0.8.0/tests/test_scanner.py +0 -69
  44. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/.env.example +0 -0
  45. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  46. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  47. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  48. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/CODE_OF_CONDUCT.md +0 -0
  49. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/CONTRIBUTING.md +0 -0
  50. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/LICENSE +0 -0
  51. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/MANIFEST.in +0 -0
  52. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/SECURITY.md +0 -0
  53. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/__main__.py +0 -0
  54. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/agents/__init__.py +0 -0
  55. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/bootstrap.py +0 -0
  56. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/cli/__init__.py +0 -0
  57. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/__init__.py +0 -0
  58. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/checkpoint.py +0 -0
  59. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/db.py +0 -0
  60. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/core/queue.py +0 -0
  61. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/llm/__init__.py +0 -0
  62. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/llm/api_provider.py +0 -0
  63. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/llm/base.py +0 -0
  64. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/llm/local_provider.py +0 -0
  65. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/parser/__init__.py +0 -0
  66. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/parser/factory.py +0 -0
  67. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/parser/python_parser.py +0 -0
  68. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/parser/react_parser.py +0 -0
  69. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/utils/__init__.py +0 -0
  70. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc/utils/errors.py +0 -0
  71. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc_ai.egg-info/dependency_links.txt +0 -0
  72. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc_ai.egg-info/entry_points.txt +0 -0
  73. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc_ai.egg-info/requires.txt +0 -0
  74. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/codedoc_ai.egg-info/top_level.txt +0 -0
  75. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/setup.cfg +0 -0
  76. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/__init__.py +0 -0
  77. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/flutter_app/app.dart +0 -0
  78. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/flutter_app/main.dart +0 -0
  79. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/java_app/Main.java +0 -0
  80. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/java_app/Service.java +0 -0
  81. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/python_app/main.py +0 -0
  82. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/python_app/models.py +0 -0
  83. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/python_app/utils.py +0 -0
  84. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/react_app/App.tsx +0 -0
  85. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/react_app/index.html +0 -0
  86. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/react_app/main.tsx +0 -0
  87. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/react_app/router.tsx +0 -0
  88. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/fixtures/react_sample.tsx +0 -0
  89. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_agents.py +0 -0
  90. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_llm_mock.py +0 -0
  91. {codedoc_ai-0.8.0 → codedoc_ai-0.9.1}/tests/test_queue.py +0 -0
@@ -1,5 +1,255 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.9.1 - 2026-06-08
4
+
5
+ ### Bug-fix stabilization patch (first PyPI release)
6
+
7
+ Corrective-only patch. No new features or output-shape changes.
8
+
9
+ - **A1 — entry-reachability is no longer silent.** Previously, when an entry was given,
10
+ files not reachable from it were dropped without notice. `_select_files` now
11
+ logs a clear WARNING listing the excluded files, records `stats["entry_excluded"]`,
12
+ and the CLI prints an excluded-files line. (The structural selection fix is
13
+ tracked for a later minor; this patch only removes the silent failure.)
14
+ - **A2 — a wrong `--entry` no longer silently documents the whole repo.** An
15
+ explicitly specified entry that cannot be resolved, is not in the scanned set,
16
+ resolves outside the project root, or is given when **no** supported files are
17
+ scanned, now raises `ConfigError` instead of falling back to all files or
18
+ exiting successfully. Auto-detection with no entry still documents everything.
19
+ - **A3 — parser false imports fixed.** The Go parser no longer treats arbitrary
20
+ string literals (e.g. `fmt.Println("hi")`) as imports — only string-literal
21
+ paths in `import "..."` statements and `import ( ... )` blocks are read,
22
+ comments are ignored, and raw-string (backtick) paths are supported.
23
+ Interpreted literals use Go's byte-accurate escape semantics, including
24
+ multi-byte UTF-8 `\xNN` / octal sequences and Unicode escapes. The HTML parser
25
+ no longer treats CSS `<link href>` as a code import (kept `<script src>` and
26
+ JS imports).
27
+ - **A4 — no stale/empty record substituted for a real one.** In the parallel
28
+ batch, a rate-limited file was treated as "already recorded" using state that
29
+ also included records **preloaded** from a prior run, so a *changed* file could
30
+ be restored from stale documentation instead of retried. `SafeWriter` now
31
+ tracks records written *this run* (`recorded_this_run()`); a changed,
32
+ rate-limited file is retried, and a file genuinely recorded this run recovers
33
+ its real record via `get_record()` (never an empty `{}`).
34
+ - **A5 — honest interrupt message.** Removed dead code; the Ctrl-C message is now
35
+ conditional ("…if the run reached file processing") so it never falsely claims
36
+ progress was saved when interrupted before any file was processed.
37
+ - **A6 — scanner is re-entrant.** The directory walker no longer stores state on
38
+ the function object; state lives on a per-scan `_Walker` instance.
39
+ - **Version identity.** `pyproject.toml`, `codedoc.__version__`, the CLI
40
+ `--version`, and the README all report `0.9.1`, and the automated test
41
+ (`test_version_identity_consistent`) enforces agreement across **all four**,
42
+ including the README "Current release" line.
43
+ - **Reliable tests.** `tests/conftest.py` redirects the temp root into the repo
44
+ (`.pyt_tmp`) so a locked system temp dir does not make the suite unrunnable.
45
+ (This addresses the observed locked-system-temp failure; it is not a guarantee
46
+ for every environment.)
47
+
48
+ ## 0.9.0 - 2026-06-04
49
+
50
+ ### Output preflight safety, clean INFO logs, extension list fix, configurable content truncation
51
+
52
+ ---
53
+
54
+ #### G0 — Output Preflight Safety
55
+
56
+ Foreign output targets now fail immediately with a `ConfigError` before the
57
+ scanner runs, the provider initialises, or any LLM API call is made. Previously
58
+ a foreign file at the target path would only be detected inside
59
+ `write_project_outputs`, after all tokens had already been spent.
60
+
61
+ - **`codedoc/core/output.py`**: Added `preflight_output_targets()` which calls
62
+ `_check_file_ownership()` for all final public targets (JSON, MD, both) and a
63
+ new `_check_md_live_backup_ownership()` for the MD live-backup JSON sibling.
64
+ - **`codedoc/pipeline.py`**: Calls `preflight_output_targets()` immediately after
65
+ output spec resolution, before `scan_files()` and `create_provider()`.
66
+ - **`codedoc/core/loader.py`**: `_resolve_output_spec()` now only emits the
67
+ format-conflict warning when `--format` was explicitly passed by the user (not
68
+ when the default `"json"` value from DEFAULTS triggers a mismatch).
69
+
70
+ #### G1 — Clean Log Output
71
+
72
+ Third-party HTTP libraries (`httpx`, `httpcore`, `openai`, `anthropic`,
73
+ `google.auth`) are now silenced at WARNING level by default. At `--verbose` /
74
+ DEBUG the HTTP diagnostics are restored. Per-agent progress lines appear at INFO
75
+ so users can see what codedoc is doing at each step.
76
+
77
+ - **`codedoc/utils/logger.py`**: `_NOISY_LOGGERS` constant defines the list;
78
+ `_configure()` sets those loggers to WARNING; `set_level()` lowers them to
79
+ DEBUG when the root logger is set to DEBUG.
80
+ - **`codedoc/agents/orchestrator.py`**: Added timing via `time.monotonic()` and
81
+ INFO/WARNING log lines after each agent: `[FILE] path | structure ok 0.8s`,
82
+ `[FILE] path | dependencies ok 0.9s`, `[FILE] path | documentation ok 1.2s`.
83
+ Fallbacks emit WARNING with `"fallback"` in the message.
84
+
85
+ #### G5 — Extension List Consistency
86
+
87
+ `_candidate_variants()` in `graph.py` used a hardcoded 9-extension list that
88
+ was out of sync with `_KNOWN_EXTENSIONS` and `DEFAULTS["extension_language_map"]`.
89
+ Import resolution for Go, Kotlin, Swift, Rust, Ruby, and C-family files silently
90
+ produced no candidates.
91
+
92
+ - **`codedoc/core/graph.py`**: `_KNOWN_EXTENSIONS` expanded to all 19 extensions
93
+ in `DEFAULTS["extension_language_map"]`. `_candidate_variants()` now uses
94
+ `sorted(_KNOWN_EXTENSIONS)` instead of a separate hardcoded list. A comment
95
+ notes the sync requirement with `loader.py`.
96
+
97
+ #### G6 — Configurable Content Truncation
98
+
99
+ Previously, files above 12,000 characters were silently truncated with only a DEBUG-level log.
100
+ Users saw degraded documentation for large files with no indication why.
101
+
102
+ - **`codedoc/core/loader.py`**: `max_content_chars` added to `DEFAULTS` (12000)
103
+ and `_ENV_KEY_MAP` (`CODEDOC_MAX_CONTENT_CHARS`). Validation requires a positive
104
+ integer ≥ 1000.
105
+ - **`codedoc/agents/base_agent.py`**: Removed module-level `_MAX_CONTENT_CHARS`
106
+ constant. `BaseAgent.__init__` now accepts `max_content_chars: int = 12000`.
107
+ `_truncate()` uses `self._max_content_chars` and logs at INFO with the file
108
+ path and original / truncated character counts.
109
+ - **`codedoc/agents/orchestrator.py`**: `Orchestrator.__init__` accepts
110
+ `max_content_chars: int = 12000` and forwards it to each agent.
111
+ - **`codedoc/pipeline.py`**: Passes `config.get("max_content_chars", 12000)` to
112
+ the `Orchestrator` constructor.
113
+ - All three agent subclasses pass `file_path` to `_truncate()` for accurate logs.
114
+
115
+ ---
116
+
117
+ ## 0.8.1 - 2026-06-02
118
+
119
+ ### Lossless Markdown, placeholder sanitization, configurable defaults, provider-aware rate-limit backoff
120
+
121
+ ---
122
+
123
+ #### Workstream A — Lossless Markdown View
124
+
125
+ Markdown output now embeds the complete public JSON view as a hidden base64
126
+ comment so `json_from_markdown()` (and incremental re-runs that read a `.md`
127
+ file) recover the full dependency catalog, per-file hashes, and all dependency
128
+ metadata without any information loss.
129
+
130
+ - **`codedoc/core/project_view.py`**:
131
+ - `markdown_from_view()` writes a `<!-- codedoc-ai-view-base64 ... -->` block
132
+ immediately after the legacy `<!-- codedoc-ai: ... -->` metadata comment.
133
+ The block is standard base64-encoded UTF-8 JSON, which avoids comment-safety
134
+ issues with raw `--` or `-->` sequences in generated text.
135
+ - `markdown_to_view()` now tries the embedded view first (fast, lossless path);
136
+ falls back to the existing visible Markdown parser for pre-0.8.1 files.
137
+ - New public helper `read_embedded_view(markdown)` decodes and validates the
138
+ embedded block; returns `None` on any failure so callers fall back safely.
139
+ - `read_codedoc_meta()` no longer raises `ConfigError` when `entry_file` is
140
+ `null`; a valid CodeDoc file with no entry point is now correctly identified
141
+ as owned rather than foreign.
142
+ - **`codedoc/pipeline.py`**:
143
+ - `_load_existing_file_docs_from_md()` preserves file hashes from the embedded
144
+ view when the lightweight metadata comment has no hash for a path.
145
+ - `_resolve_entry_and_docs()` no longer raises unconditionally when no existing
146
+ output is found; first runs without `--entry` now reach `detect_entry_file()`
147
+ for auto-detection instead of failing immediately.
148
+
149
+ #### Workstream B — Placeholder Usage Example Sanitization
150
+
151
+ LLM-generated usage examples that contain placeholder package names (e.g.
152
+ `import 'package:your_package/...'`) are now removed before any output is
153
+ written or cached.
154
+
155
+ - **`codedoc/core/project_view.py`**: `_clean_file()` calls the new
156
+ `_sanitize_usage_example()` helper, which checks against `_PLACEHOLDER_PATTERN`
157
+ (a compiled `re.IGNORECASE` regex with word-boundary guards). Covered
158
+ placeholders: `your_package_name`, `your_package`, `your_project`, `your_app`,
159
+ `example_package`, `my_package`, and Dart-style `package:example/`.
160
+ Sanitization is idempotent and applies to both freshly generated records and
161
+ cached/reused records loaded from prior output files.
162
+
163
+ #### Workstream C — Configurable Hardcoded Defaults
164
+
165
+ All previously hardcoded scanner and provider defaults are now driven by a
166
+ single source of truth in `DEFAULTS` (`loader.py`) and support `_add` / `_remove`
167
+ override keys.
168
+
169
+ - **`codedoc/core/loader.py`**:
170
+ - `DEFAULTS` gains eleven new keys: `skip_dirs_add`, `skip_dirs_remove`,
171
+ `extension_language_map` (full 19-entry map), `extension_language_map_add`,
172
+ `extension_language_map_remove`, `auto_entry_candidates`,
173
+ `auto_entry_candidates_add`, `auto_entry_candidates_remove`,
174
+ `provider_prefixes`, `provider_prefixes_add`, `provider_prefixes_remove`.
175
+ - Three resolver helpers implement the resolution order (replace → `_add` →
176
+ `_remove`): `_resolve_list_override`, `_resolve_dict_override`,
177
+ `_resolve_nested_list_dict_override`.
178
+ - `_apply_config_overrides()` is called after all config sources are merged;
179
+ it resolves all four configurable keys and derives `supported_extensions`
180
+ from the resolved `extension_language_map`.
181
+ - Backward-compat bridge: if `supported_extensions` was explicitly set to a
182
+ value different from the defaults, it is used as a filter on
183
+ `extension_language_map` so old configs continue to restrict scanning as
184
+ intended.
185
+ - **`codedoc/core/scanner.py`**:
186
+ - Hardcoded `SKIP_DIRS` and `EXTENSION_LANGUAGE_MAP` removed.
187
+ - `scan_files()` receives `extension_language_map` (primary) instead of
188
+ `supported_extensions`. A positional-list guard handles legacy callers
189
+ that pass a list as the second argument.
190
+ - `detect_entry_file()` receives the resolved `auto_entry_candidates` list;
191
+ falls back to a module-level default for direct callers.
192
+ - **`codedoc/pipeline.py`**: passes `extension_language_map` and
193
+ `auto_entry_candidates` to the scanner; always appends the output directory
194
+ name to the scan skip list (even when the user removed it via
195
+ `--remove-skip-dir`) to prevent codedoc from documenting its own output.
196
+ - **`codedoc/cli/cli.py`**: three new flags: `--skip-dirs DIR [...]`,
197
+ `--add-skip-dir DIR` (repeatable), `--remove-skip-dir DIR` (repeatable).
198
+ - **`codedoc/llm/factory.py`**: `create_provider()`, `_make_api()`,
199
+ `_resolve_api_provider()`, and `_provider_api_key()` all accept and use
200
+ `provider_prefixes` from config; module-level tuples kept as fallbacks.
201
+
202
+ #### Workstream D — Provider-Aware Rate-Limit Backoff
203
+
204
+ Parallel ladder step-downs now sleep between rungs using provider-aware
205
+ exponential backoff, with optional `Retry-After` hint parsing.
206
+
207
+ - **`codedoc/llm/rate_limit_profile.py`** *(new)*:
208
+ - `RateLimitProfile` dataclass — `provider`, `signals`, `min_backoff_s`,
209
+ `backoff_scale`.
210
+ - `PROVIDER_PROFILES` — preconfigured profiles for `openai`, `anthropic`,
211
+ `gemini`, and `default`.
212
+ - `get_rate_limit_profile(provider_name, config)` — returns the resolved
213
+ profile with `rate_limit_backoff_s`, `rate_limit_backoff_scale`,
214
+ `rate_limit_signals_add`, and `rate_limit_signals_remove` applied without
215
+ mutating module defaults.
216
+ - **`codedoc/pipeline.py`**:
217
+ - `_is_rate_limit_error(exc, profile=None)` — when a `profile` is supplied,
218
+ checks only `profile.signals`; falls back to `_RATE_LIMIT_SIGNALS` for
219
+ backward compatibility with callers without a profile.
220
+ - `_detect_limit_type(error_msg)` — classifies errors as `"tpm"`, `"rpm"`,
221
+ `"quota"`, `"overloaded"`, or `None`.
222
+ - `_process_descriptor_batch()` return type changed:
223
+ `retry_rate_limited` is now `list[tuple[dict, Exception]]` so the causing
224
+ exception is preserved for `Retry-After` parsing and error sampling.
225
+ - `_process_agent_files()`: fetches the provider profile, passes it to
226
+ `_process_descriptor_batch()`, and sleeps between rungs using:
227
+ - `min(Retry-After, retry_after_cap_s)` when a hint is present and
228
+ `respect_retry_after = True`,
229
+ - `min(min_backoff_s × backoff_scale ^ rung, retry_after_cap_s)` otherwise,
230
+ - no sleep when `rate_limit_backoff_s = 0`.
231
+ - Rate-limit warning dicts now include: `retry_after_s`, `sleep_s`,
232
+ `error_sample`, `limit_type`, `event_number`, `rung_index`.
233
+ - **`codedoc/core/loader.py`**: four new `DEFAULTS` keys:
234
+ `rate_limit_backoff_s`, `rate_limit_backoff_scale`, `rate_limit_signals_add`,
235
+ `rate_limit_signals_remove`.
236
+ - **`codedoc/cli/cli.py`**: compact rate-limit summary line printed only when
237
+ step-down events occurred; shows event count, providers, and total sleep time.
238
+
239
+ #### Version
240
+
241
+ - `codedoc/__init__.py`, `pyproject.toml`, `cli.py`: `0.8.0` → `0.8.1`.
242
+
243
+ #### Validation
244
+
245
+ - Added regression coverage for lossless Markdown regeneration, placeholder
246
+ sanitization, configurable defaults, provider-aware rate-limit backoff, and
247
+ rate-limit edge cases.
248
+ - Full test suite passes.
249
+ - Built sdist/wheel and verified release metadata with `twine check`.
250
+
251
+ ---
252
+
3
253
  ## 0.8.0 - 2026-05-31
4
254
 
5
255
  ### Always-on live JSON crash backup, parallel crash-safety, rate-limit adaptive parallelism, error.log overhaul
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codedoc-ai
3
- Version: 0.8.0
3
+ Version: 0.9.1
4
4
  Summary: Generate structured, incremental documentation for any codebase using OpenAI, Anthropic, or Gemini
5
5
  Author: Atharv Mannur
6
6
  License-Expression: MIT
@@ -40,7 +40,7 @@ Dynamic: license-file
40
40
 
41
41
  The tool scans source files, resolves project-local imports into a dependency graph, sends only files that need analysis to an LLM, and writes one combined, structured documentation artifact designed for both humans and AI. By default that artifact is JSON.
42
42
 
43
- Current release: `0.8.0`.
43
+ Current release: `0.9.1`.
44
44
 
45
45
  ## What It Does
46
46
 
@@ -91,6 +91,7 @@ codedoc run
91
91
  | Live JSON backup | always on (0.8.0 default) |
92
92
  | Rate-limit adaptive | `true` |
93
93
  | Max file size | `500 KB` |
94
+ | Max content chars | `12000` |
94
95
 
95
96
  Because the default provider uses the OpenAI API, a user must supply an API key unless they select a different provider.
96
97
 
@@ -340,7 +341,47 @@ Create `codedoc.config.json` in the project being documented:
340
341
  "parallel_ladder": null,
341
342
  "respect_retry_after": true,
342
343
  "retry_after_cap_s": 30,
344
+ "rate_limit_backoff_s": null,
345
+ "rate_limit_backoff_scale": null,
346
+ "rate_limit_signals_add": [],
347
+ "rate_limit_signals_remove": [],
343
348
  "skip_dirs": ["myenv", ".venv", "venv", "env", "node_modules", "__pycache__", "codedoc"],
349
+ "skip_dirs_add": [],
350
+ "skip_dirs_remove": [],
351
+ "max_content_chars": 12000,
352
+ "extension_language_map": {
353
+ ".py": "python",
354
+ ".ts": "typescript",
355
+ ".tsx": "tsx",
356
+ ".js": "javascript",
357
+ ".jsx": "jsx",
358
+ ".dart": "dart",
359
+ ".java": "java",
360
+ ".cs": "csharp",
361
+ ".html": "html",
362
+ ".htm": "html",
363
+ ".kt": "kotlin",
364
+ ".swift": "swift",
365
+ ".go": "go",
366
+ ".rb": "ruby",
367
+ ".rs": "rust",
368
+ ".cpp": "cpp",
369
+ ".c": "c",
370
+ ".h": "c",
371
+ ".hpp": "cpp"
372
+ },
373
+ "extension_language_map_add": {},
374
+ "extension_language_map_remove": [],
375
+ "auto_entry_candidates": ["index.html", "main.tsx", "main.ts", "main.js", "main.py", "main.dart", "Main.java", "Program.cs"],
376
+ "auto_entry_candidates_add": [],
377
+ "auto_entry_candidates_remove": [],
378
+ "provider_prefixes": {
379
+ "anthropic": ["claude"],
380
+ "gemini": ["gemini"],
381
+ "openai": ["gpt-", "o1", "o3", "text-"]
382
+ },
383
+ "provider_prefixes_add": {},
384
+ "provider_prefixes_remove": {},
344
385
  "ignore_paths": ["/myenv", "services/generated"]
345
386
  }
346
387
  ```
@@ -369,6 +410,21 @@ Parallelism settings:
369
410
  | `file_retry_attempts` | Number of sequential retries for a failed file. Default: `1`. |
370
411
  | `max_consecutive_failures` | Stops the run after repeated failures so provider/API problems are visible quickly. Default: `5`. |
371
412
 
413
+ Configurable defaults added in 0.8.1:
414
+
415
+ | Setting | Purpose |
416
+ | --- | --- |
417
+ | `skip_dirs`, `skip_dirs_add`, `skip_dirs_remove` | Replace, extend, or reduce the set of directory names skipped anywhere in the tree. Use `--remove-skip-dir codedoc` to document a directory named `codedoc` (for example, this package's own source) while codedoc still skips its output directory. |
418
+ | `extension_language_map`, `extension_language_map_add`, `extension_language_map_remove` | Control which extensions are scanned and what language label each gets. Any extension in the resolved map is supported. |
419
+ | `auto_entry_candidates`, `auto_entry_candidates_add`, `auto_entry_candidates_remove` | Control first-run entry auto-detection when `--entry` is omitted. |
420
+ | `provider_prefixes`, `provider_prefixes_add`, `provider_prefixes_remove` | Control model-name based provider auto-detection and matching API-key lookup. |
421
+
422
+ Configurable settings added in 0.9.0:
423
+
424
+ | Setting | Default | Purpose |
425
+ | --- | --- | --- |
426
+ | `max_content_chars` | `12000` | Maximum characters of file content sent to the LLM per file. Files longer than this are truncated and an INFO log line is emitted with the file path and character counts. Raise this for large-context providers (`60000`–`100000`). Must be at least `1000`. |
427
+
372
428
  ## Environment Variables
373
429
 
374
430
  Secrets should live in environment variables or a local `.env` file that is ignored by Git. Use [.env.example](.env.example) as the template.
@@ -393,6 +449,7 @@ Supported variables:
393
449
  | `CODEDOC_MAX_CONSECUTIVE_FAILURES` | Consecutive failure threshold before stopping. |
394
450
  | `LOG_LEVEL` | `INFO`, `DEBUG`, etc. |
395
451
  | `CODEDOC_IGNORE_PATHS` | Semicolon-separated ignore paths. |
452
+ | `CODEDOC_MAX_CONTENT_CHARS` | Maximum characters of file content sent to the LLM. Equivalent to `max_content_chars` in config. |
396
453
 
397
454
  Example `.env` for OpenAI:
398
455
 
@@ -634,7 +691,7 @@ with the final clean output.
634
691
  and now has no effect — live backup is always on. Passing it prints a deprecation
635
692
  notice. It will be removed in a future release.
636
693
 
637
- ### Adaptive rate-limit parallelism (0.8.0)
694
+ ### Adaptive rate-limit parallelism (0.8.1)
638
695
 
639
696
  When a provider signals 429 / rate-limit / quota-exceeded, codedoc automatically
640
697
  steps down file-level concurrency instead of hammering the API:
@@ -656,9 +713,34 @@ Customize it in config:
656
713
  }
657
714
  ```
658
715
 
659
- Provider-specific rate-limit signals are recognised for OpenAI (`429`,
660
- `rate_limit_exceeded`, `tpm`), Anthropic (`529`, `overloaded`), and Gemini
661
- (`RESOURCE_EXHAUSTED`, `quota`). Non-rate-limit errors never trigger a step-down.
716
+ Provider-specific rate-limit signals are recognised for OpenAI (`429`, `rate limit`,
717
+ `rate_limit`, `too many requests`, `tokens per min`, `tpm`, `quota`), Anthropic
718
+ (`529`, `overloaded`, `rate_limit`, `429`), and Gemini (`resource_exhausted`,
719
+ `quota`, `429`, `503`). Non-rate-limit errors never trigger a step-down.
720
+
721
+ In 0.8.1, codedoc sleeps between parallel step-down rungs using provider-aware
722
+ backoff. You can tune this in config:
723
+
724
+ ```json
725
+ {
726
+ "rate_limit_backoff_s": null,
727
+ "rate_limit_backoff_scale": null,
728
+ "rate_limit_signals_add": ["capacity exceeded", "throttled"],
729
+ "rate_limit_signals_remove": ["503"]
730
+ }
731
+ ```
732
+
733
+ Set `rate_limit_backoff_s` to `0` to disable computed inter-rung backoff.
734
+ `Retry-After` hints are still honored when `respect_retry_after` is true.
735
+
736
+ ### Lossless Markdown regeneration (0.8.1)
737
+
738
+ Markdown output remains human-readable, but codedoc now embeds a hidden
739
+ base64-encoded public JSON view in a `<!-- codedoc-ai-view-base64 ... -->`
740
+ comment. This lets later Markdown-to-JSON conversion and incremental re-runs
741
+ recover dependency catalogs, per-file dependency metadata, links, and hashes
742
+ without another LLM call. Legacy Markdown without the embedded view still uses
743
+ the best-effort visible Markdown parser.
662
744
 
663
745
  ### Issue log (`error.log`)
664
746
 
@@ -676,11 +758,16 @@ Only hard file failures are surfaced there.
676
758
 
677
759
  ### Ownership guard
678
760
 
679
- Before writing, `codedoc` checks that any existing file at the target path was
680
- produced by codedoc (a `_codedoc` metadata block in JSON, or a `<!-- codedoc-ai: -->`
681
- comment in Markdown). If the file is foreign, malformed, or empty, the run stops
682
- with a clear `ConfigError` instead of overwriting it. Choose a different
683
- `--output` directory or remove the conflicting file to proceed.
761
+ `codedoc` checks that any existing file at the target path was produced by
762
+ codedoc (a `_codedoc` metadata block in JSON, or a `<!-- codedoc-ai: -->` comment
763
+ in Markdown). If the file is foreign, malformed, or empty, the run stops with a
764
+ clear `ConfigError`. Choose a different `--output` directory or remove the
765
+ conflicting file to proceed.
766
+
767
+ **Preflight (0.9.0).** The ownership check now runs *before* any filesystem
768
+ changes, directory creation, scanning, or LLM calls. A foreign target that would
769
+ block the final write is caught immediately — no tokens are spent and no output
770
+ directory is created.
684
771
 
685
772
  ### More detail
686
773
 
@@ -4,7 +4,7 @@
4
4
 
5
5
  The tool scans source files, resolves project-local imports into a dependency graph, sends only files that need analysis to an LLM, and writes one combined, structured documentation artifact designed for both humans and AI. By default that artifact is JSON.
6
6
 
7
- Current release: `0.8.0`.
7
+ Current release: `0.9.1`.
8
8
 
9
9
  ## What It Does
10
10
 
@@ -55,6 +55,7 @@ codedoc run
55
55
  | Live JSON backup | always on (0.8.0 default) |
56
56
  | Rate-limit adaptive | `true` |
57
57
  | Max file size | `500 KB` |
58
+ | Max content chars | `12000` |
58
59
 
59
60
  Because the default provider uses the OpenAI API, a user must supply an API key unless they select a different provider.
60
61
 
@@ -304,7 +305,47 @@ Create `codedoc.config.json` in the project being documented:
304
305
  "parallel_ladder": null,
305
306
  "respect_retry_after": true,
306
307
  "retry_after_cap_s": 30,
308
+ "rate_limit_backoff_s": null,
309
+ "rate_limit_backoff_scale": null,
310
+ "rate_limit_signals_add": [],
311
+ "rate_limit_signals_remove": [],
307
312
  "skip_dirs": ["myenv", ".venv", "venv", "env", "node_modules", "__pycache__", "codedoc"],
313
+ "skip_dirs_add": [],
314
+ "skip_dirs_remove": [],
315
+ "max_content_chars": 12000,
316
+ "extension_language_map": {
317
+ ".py": "python",
318
+ ".ts": "typescript",
319
+ ".tsx": "tsx",
320
+ ".js": "javascript",
321
+ ".jsx": "jsx",
322
+ ".dart": "dart",
323
+ ".java": "java",
324
+ ".cs": "csharp",
325
+ ".html": "html",
326
+ ".htm": "html",
327
+ ".kt": "kotlin",
328
+ ".swift": "swift",
329
+ ".go": "go",
330
+ ".rb": "ruby",
331
+ ".rs": "rust",
332
+ ".cpp": "cpp",
333
+ ".c": "c",
334
+ ".h": "c",
335
+ ".hpp": "cpp"
336
+ },
337
+ "extension_language_map_add": {},
338
+ "extension_language_map_remove": [],
339
+ "auto_entry_candidates": ["index.html", "main.tsx", "main.ts", "main.js", "main.py", "main.dart", "Main.java", "Program.cs"],
340
+ "auto_entry_candidates_add": [],
341
+ "auto_entry_candidates_remove": [],
342
+ "provider_prefixes": {
343
+ "anthropic": ["claude"],
344
+ "gemini": ["gemini"],
345
+ "openai": ["gpt-", "o1", "o3", "text-"]
346
+ },
347
+ "provider_prefixes_add": {},
348
+ "provider_prefixes_remove": {},
308
349
  "ignore_paths": ["/myenv", "services/generated"]
309
350
  }
310
351
  ```
@@ -333,6 +374,21 @@ Parallelism settings:
333
374
  | `file_retry_attempts` | Number of sequential retries for a failed file. Default: `1`. |
334
375
  | `max_consecutive_failures` | Stops the run after repeated failures so provider/API problems are visible quickly. Default: `5`. |
335
376
 
377
+ Configurable defaults added in 0.8.1:
378
+
379
+ | Setting | Purpose |
380
+ | --- | --- |
381
+ | `skip_dirs`, `skip_dirs_add`, `skip_dirs_remove` | Replace, extend, or reduce directory names skipped anywhere in the tree. Use `--remove-skip-dir codedoc` to document this package source while codedoc still skips its output directory. |
382
+ | `extension_language_map`, `extension_language_map_add`, `extension_language_map_remove` | Control which extensions are scanned and what language label each gets. Any extension in the resolved map is supported. |
383
+ | `auto_entry_candidates`, `auto_entry_candidates_add`, `auto_entry_candidates_remove` | Control first-run entry auto-detection when `--entry` is omitted. |
384
+ | `provider_prefixes`, `provider_prefixes_add`, `provider_prefixes_remove` | Control model-name based provider auto-detection and matching API-key lookup. |
385
+
386
+ Configurable settings added in 0.9.0:
387
+
388
+ | Setting | Default | Purpose |
389
+ | --- | --- | --- |
390
+ | `max_content_chars` | `12000` | Maximum characters of file content sent to the LLM per file. Files longer than this are truncated and an INFO log line is emitted with the file path and character counts. For large-context providers, raise this (for example, to a value between `60000` and `100000`). Must be at least `1000`. |
391
+
336
392
  ## Environment Variables
337
393
 
338
394
  Secrets should live in environment variables or a local `.env` file that is ignored by Git. Use [.env.example](.env.example) as the template.
@@ -357,6 +413,7 @@ Supported variables:
357
413
  | `CODEDOC_MAX_CONSECUTIVE_FAILURES` | Consecutive failure threshold before stopping. |
358
414
  | `LOG_LEVEL` | `INFO`, `DEBUG`, etc. |
359
415
  | `CODEDOC_IGNORE_PATHS` | Semicolon-separated ignore paths. |
416
+ | `CODEDOC_MAX_CONTENT_CHARS` | Maximum characters of file content sent to the LLM. Equivalent to `max_content_chars` in config. |
360
417
 
361
418
  Example `.env` for OpenAI:
362
419
 
@@ -598,7 +655,7 @@ with the final clean output.
598
655
  and now has no effect — live backup is always on. Passing it prints a deprecation
599
656
  notice. It will be removed in a future release.
600
657
 
601
- ### Adaptive rate-limit parallelism (0.8.0)
658
+ ### Adaptive rate-limit parallelism (0.8.1)
602
659
 
603
660
  When a provider signals 429 / rate-limit / quota-exceeded, codedoc automatically
604
661
  steps down file-level concurrency instead of hammering the API:
@@ -620,9 +677,34 @@ Customize it in config:
620
677
  }
621
678
  ```
622
679
 
623
- Provider-specific rate-limit signals are recognised for OpenAI (`429`,
624
- `rate_limit_exceeded`, `tpm`), Anthropic (`529`, `overloaded`), and Gemini
625
- (`RESOURCE_EXHAUSTED`, `quota`). Non-rate-limit errors never trigger a step-down.
680
+ Provider-specific rate-limit signals are recognised for OpenAI (`429`, `rate limit`,
681
+ `rate_limit`, `too many requests`, `tokens per min`, `tpm`, `quota`), Anthropic
682
+ (`529`, `overloaded`, `rate_limit`, `429`), and Gemini (`resource_exhausted`,
683
+ `quota`, `429`, `503`). Non-rate-limit errors never trigger a step-down.
684
+
685
+ In 0.8.1, codedoc sleeps between parallel step-down rungs using provider-aware
686
+ backoff. You can tune this in config:
687
+
688
+ ```json
689
+ {
690
+ "rate_limit_backoff_s": null,
691
+ "rate_limit_backoff_scale": null,
692
+ "rate_limit_signals_add": ["capacity exceeded", "throttled"],
693
+ "rate_limit_signals_remove": ["503"]
694
+ }
695
+ ```
696
+
697
+ Set `rate_limit_backoff_s` to `0` to disable computed inter-rung backoff.
698
+ `Retry-After` hints are still honored when `respect_retry_after` is true.
699
+
700
+ ### Lossless Markdown regeneration (0.8.1)
701
+
702
+ Markdown output remains human-readable, but codedoc now embeds a hidden
703
+ base64-encoded public JSON view in a `<!-- codedoc-ai-view-base64 ... -->`
704
+ comment. This lets later Markdown-to-JSON conversion and incremental re-runs
705
+ recover dependency catalogs, per-file dependency metadata, links, and hashes
706
+ without another LLM call. Legacy Markdown without the embedded view still uses
707
+ the best-effort visible Markdown parser.
626
708
 
627
709
  ### Issue log (`error.log`)
628
710
 
@@ -640,11 +722,16 @@ Only hard file failures are surfaced there.
640
722
 
641
723
  ### Ownership guard
642
724
 
643
- Before writing, `codedoc` checks that any existing file at the target path was
644
- produced by codedoc (a `_codedoc` metadata block in JSON, or a `<!-- codedoc-ai: -->`
645
- comment in Markdown). If the file is foreign, malformed, or empty, the run stops
646
- with a clear `ConfigError` instead of overwriting it. Choose a different
647
- `--output` directory or remove the conflicting file to proceed.
725
+ `codedoc` checks that any existing file at the target path was produced by
726
+ codedoc (a `_codedoc` metadata block in JSON, or a `<!-- codedoc-ai: -->` comment
727
+ in Markdown). If the file is foreign, malformed, or empty, the run stops with a
728
+ clear `ConfigError`. Choose a different `--output` directory or remove the
729
+ conflicting file to proceed.
730
+
731
+ **Preflight (0.9.0).** The ownership check now runs *before* any filesystem
732
+ changes, directory creation, scanning, or LLM calls. A foreign target that would
733
+ block the final write is caught immediately — no tokens are spent and no output
734
+ directory is created.
648
735
 
649
736
  ### More detail
650
737
 
@@ -203,15 +203,38 @@ main thread, so a Ctrl-C or crash after a worker completes never discards that r
203
203
  - **Ownership guard.** `codedoc` refuses to overwrite a file it did not create (no `_codedoc`
204
204
  metadata block) — including the JSON backup sibling for named-MD runs.
205
205
 
206
- **Rate-limit step-down (0.8.0):**
206
+ **Rate-limit step-down (0.8.1):**
207
207
  When a rate-limit signal is detected during parallel processing, codedoc steps down the
208
- file concurrency ladder and prints a provider-specific notice to the terminal:
208
+ file concurrency ladder, sleeps using provider-aware exponential backoff, and prints a
209
+ notice to the terminal:
209
210
 
210
211
  ```
211
- [OpenAI] Rate limit detected - your configured max_parallel_files (5) has been
212
- reduced to 2. Retrying 4 remaining file(s) at lower concurrency.
212
+ [anthropic] Rate limit detected - your configured max_parallel_files (5) has been
213
+ reduced to 2. Retrying 4 remaining file(s) at lower concurrency. Sleeping 10.0s before retry.
213
214
  ```
214
215
 
216
+ At the end of the run, a compact summary line is printed only when step-down events
217
+ occurred:
218
+
219
+ ```
220
+ Rate limits: 1 step-down event(s) [anthropic], 10.0s total backoff. Details in error.log.
221
+ ```
222
+
223
+ Backoff behavior (provider defaults, all overridable via config):
224
+
225
+ | Provider | Signals | Min backoff | Scale |
226
+ |-----------|-------------------------------------------------------|------------:|------:|
227
+ | openai | 429, rate limit, tpm, quota, ... | 5 s | 1.5× |
228
+ | anthropic | 529, overloaded, rate_limit, 429 | 10 s | 2.0× |
229
+ | gemini | resource_exhausted, quota, 429, 503 | 8 s | 1.5× |
230
+ | default | (union of all above) | 5 s | 1.5× |
231
+
232
+ Config overrides:
233
+ - `rate_limit_backoff_s` — override min backoff globally (`0` disables sleep).
234
+ - `rate_limit_backoff_scale` — override exponential scale globally.
235
+ - `rate_limit_signals_add` — add extra signal strings (for custom gateways).
236
+ - `rate_limit_signals_remove` — remove signals from the resolved profile.
237
+
215
238
  Recovered rate-limit events appear in `error.log` (located in the output directory,
216
239
  not the project root) as warnings, and are not reported as failures in the final output.
217
240
 
@@ -1,6 +1,6 @@
1
1
  """codedoc: local-first, LLM-agnostic codebase documentation."""
2
2
 
3
- __version__ = "0.8.0"
3
+ __version__ = "0.9.1"
4
4
  __author__ = "codedoc contributors"
5
5
 
6
6