requirements-as-code 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. {requirements_as_code-0.3.0/requirements_as_code.egg-info → requirements_as_code-0.3.1}/PKG-INFO +26 -7
  2. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/README.md +15 -5
  3. requirements_as_code-0.3.1/planning/roadmap/v0.3.1-formats.md +29 -0
  4. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/pyproject.toml +16 -4
  5. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/cli.py +1 -1
  6. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/ingest.py +44 -5
  7. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1/requirements_as_code.egg-info}/PKG-INFO +26 -7
  8. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/requirements_as_code.egg-info/SOURCES.txt +1 -0
  9. requirements_as_code-0.3.1/requirements_as_code.egg-info/requires.txt +21 -0
  10. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_ingest.py +86 -3
  11. requirements_as_code-0.3.0/requirements_as_code.egg-info/requires.txt +0 -9
  12. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/.github/workflows/python-publish.yml +0 -0
  13. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/.gitignore +0 -0
  14. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/LICENSE +0 -0
  15. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/examples/example_dashboard_v1.md +0 -0
  16. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/examples/example_dashboard_v2.md +0 -0
  17. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-001-markdown-first.md +0 -0
  18. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-002-ai-optional.md +0 -0
  19. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-003-structured-outputs-first.md +0 -0
  20. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-004-artifact-model.md +0 -0
  21. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-005-cli-first.md +0 -0
  22. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-006-ingest-over-rewrite.md +0 -0
  23. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-007-json-contract-stability.md +0 -0
  24. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-008-agent-ready-architecture.md +0 -0
  25. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-009-ai-assisted-development +0 -0
  26. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-010-documents-are-not-artifacts.md +0 -0
  27. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/adr/adr-011-file-first-pipeline.md +0 -0
  28. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/future/v1.0-workspace-analysis.md +0 -0
  29. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/future/v1.1-review-engine.md +0 -0
  30. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/future/v1.2-mcp-server.md +0 -0
  31. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/future/v1.4-claude-skills.md +0 -0
  32. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/future/v1.4-python-sdk.md +0 -0
  33. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.2-stats.md +0 -0
  34. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.3-ingest.md +0 -0
  35. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.4-inspect.md +0 -0
  36. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.5-decisions.md +0 -0
  37. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.6-roadmaps.md +0 -0
  38. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/planning/roadmap/v0.7-prompts.md +0 -0
  39. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/__init__.py +0 -0
  40. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/diff.py +0 -0
  41. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/models.py +0 -0
  42. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/outputs.py +0 -0
  43. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/parser.py +0 -0
  44. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/stats.py +0 -0
  45. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/rac/validate.py +0 -0
  46. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/requirements_as_code.egg-info/dependency_links.txt +0 -0
  47. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/requirements_as_code.egg-info/entry_points.txt +0 -0
  48. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/requirements_as_code.egg-info/top_level.txt +0 -0
  49. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/setup.cfg +0 -0
  50. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/conftest.py +0 -0
  51. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/diff/new.md +0 -0
  52. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/diff/old.md +0 -0
  53. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/ingest/sample.md +0 -0
  54. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/duplicate_ids.md +0 -0
  55. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/empty_req_text.md +0 -0
  56. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/malformed_id.md +0 -0
  57. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/missing_id.md +0 -0
  58. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/missing_problem.md +0 -0
  59. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/missing_requirements.md +0 -0
  60. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/missing_title.md +0 -0
  61. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/invalid/multiple_titles.md +0 -0
  62. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/portfolio/broken.md +0 -0
  63. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/portfolio/feature_a.md +0 -0
  64. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/portfolio/feature_b.md +0 -0
  65. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/portfolio/sub/feature_c.md +0 -0
  66. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/valid/bullet_requirements.md +0 -0
  67. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/valid/feature.md +0 -0
  68. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/valid/minimal.md +0 -0
  69. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/fixtures/valid/warnings.md +0 -0
  70. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_cli.py +0 -0
  71. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_diff.py +0 -0
  72. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_parser.py +0 -0
  73. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_stats.py +0 -0
  74. {requirements_as_code-0.3.0 → requirements_as_code-0.3.1}/tests/test_validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: requirements-as-code
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: RAC — lint and diff product requirements written in Markdown.
5
5
  Author: tcballard
6
6
  License-Expression: MIT
@@ -23,10 +23,19 @@ License-File: LICENSE
23
23
  Requires-Dist: markdown-it-py>=3.0
24
24
  Provides-Extra: ingest
25
25
  Requires-Dist: markitdown[docx]; extra == "ingest"
26
+ Provides-Extra: ingest-pdf
27
+ Requires-Dist: markitdown[pdf]; extra == "ingest-pdf"
28
+ Provides-Extra: ingest-office
29
+ Requires-Dist: markitdown[pptx,xls,xlsx]; extra == "ingest-office"
30
+ Provides-Extra: ingest-all
31
+ Requires-Dist: markitdown[docx,pdf,pptx,xls,xlsx]; extra == "ingest-all"
26
32
  Provides-Extra: dev
27
33
  Requires-Dist: pytest>=7.0; extra == "dev"
28
- Requires-Dist: markitdown[docx]; extra == "dev"
34
+ Requires-Dist: markitdown[docx,pdf,pptx,xls,xlsx]; extra == "dev"
29
35
  Requires-Dist: python-docx; extra == "dev"
36
+ Requires-Dist: python-pptx; extra == "dev"
37
+ Requires-Dist: openpyxl; extra == "dev"
38
+ Requires-Dist: reportlab; extra == "dev"
30
39
  Dynamic: license-file
31
40
 
32
41
  # RAC (Requirements-as-Code)
@@ -436,15 +445,25 @@ rac ingest spec.docx --json # { source, converter, output, markdown }
436
445
  ```
437
446
 
438
447
  Conversion is powered by [MarkItDown](https://github.com/microsoft/markitdown),
439
- installed via the optional `ingest` extra:
448
+ installed via optional extras — split by format so you only pull the readers you
449
+ need:
450
+
451
+ | Extra | Adds | Formats |
452
+ |-------|------|---------|
453
+ | `ingest` | `markitdown[docx]` | DOCX, HTML, Markdown |
454
+ | `ingest-pdf` | `markitdown[pdf]` | + PDF |
455
+ | `ingest-office` | `markitdown[pptx,xlsx,xls]` | + PPTX, XLSX, XLS |
456
+ | `ingest-all` | everything above | all supported formats |
440
457
 
441
458
  ```bash
442
- pip install "requirements-as-code[ingest]"
459
+ pip install "requirements-as-code[ingest]" # DOCX + HTML + Markdown
460
+ pip install "requirements-as-code[ingest-all]" # everything
443
461
  ```
444
462
 
445
- Supported today: **DOCX** and **Markdown** (pass-through). HTML and PDF are
446
- planned for v0.3.x. Converters live behind a `DocumentConverter` abstraction, so
447
- new sources can be added without changing the CLI.
463
+ Markdown needs no extra (it is a pass-through); HTML needs only MarkItDown itself
464
+ (any `ingest*` extra). If a file's reader isn't installed, `rac ingest` tells you exactly
465
+ which extra to install. Converters live behind a `DocumentConverter` abstraction,
466
+ so new sources can be added without changing the CLI.
448
467
 
449
468
  `ingest` exits `0` on success, `1` if a recognized document fails to convert, and
450
469
  `2` for usage errors (file not found, unsupported type, missing `ingest` extra,
@@ -405,15 +405,25 @@ rac ingest spec.docx --json # { source, converter, output, markdown }
405
405
  ```
406
406
 
407
407
  Conversion is powered by [MarkItDown](https://github.com/microsoft/markitdown),
408
- installed via the optional `ingest` extra:
408
+ installed via optional extras — split by format so you only pull the readers you
409
+ need:
410
+
411
+ | Extra | Adds | Formats |
412
+ |-------|------|---------|
413
+ | `ingest` | `markitdown[docx]` | DOCX, HTML, Markdown |
414
+ | `ingest-pdf` | `markitdown[pdf]` | + PDF |
415
+ | `ingest-office` | `markitdown[pptx,xlsx,xls]` | + PPTX, XLSX, XLS |
416
+ | `ingest-all` | everything above | all supported formats |
409
417
 
410
418
  ```bash
411
- pip install "requirements-as-code[ingest]"
419
+ pip install "requirements-as-code[ingest]" # DOCX + HTML + Markdown
420
+ pip install "requirements-as-code[ingest-all]" # everything
412
421
  ```
413
422
 
414
- Supported today: **DOCX** and **Markdown** (pass-through). HTML and PDF are
415
- planned for v0.3.x. Converters live behind a `DocumentConverter` abstraction, so
416
- new sources can be added without changing the CLI.
423
+ Markdown needs no extra (it is a pass-through); HTML needs only MarkItDown itself
424
+ (any `ingest*` extra). If a file's reader isn't installed, `rac ingest` tells you exactly
425
+ which extra to install. Converters live behind a `DocumentConverter` abstraction,
426
+ so new sources can be added without changing the CLI.
417
427
 
418
428
  `ingest` exits `0` on success, `1` if a recognized document fails to convert, and
419
429
  `2` for usage errors (file not found, unsupported type, missing `ingest` extra,
@@ -0,0 +1,29 @@
1
+ # v0.3.1 Ingestion Formats
2
+
3
+ ## Problem
4
+
5
+ v0.3 shipped document ingestion for DOCX and Markdown only.
6
+
7
+ Product knowledge also lives in HTML pages, PDFs, slide decks, and spreadsheets, and users should not have to convert those by hand before ingesting them.
8
+
9
+ ## Requirements
10
+
11
+ [REQ-001] User can ingest HTML files
12
+ [REQ-002] User can ingest PDF files
13
+ [REQ-003] User can ingest PPTX files
14
+ [REQ-004] User can ingest XLSX files
15
+ [REQ-005] New formats reuse the existing DocumentConverter abstraction
16
+ [REQ-006] Missing format dependencies produce a clear install message
17
+ [REQ-007] Format readers install as granular optional extras
18
+
19
+ ## Success Metrics
20
+
21
+ - Each supported format converts a sample file to Markdown
22
+ - Adding a format requires no changes to the CLI
23
+ - The ingest-all extra installs all supported format readers
24
+
25
+ ## Risks
26
+
27
+ - Conversion quality varies by source document
28
+ - PDF text extraction may lose structure
29
+ - A heavier ingest extra increases install size
@@ -33,10 +33,22 @@ Repository = "https://github.com/tcballard/requirements-as-code"
33
33
  Issues = "https://github.com/tcballard/requirements-as-code/issues"
34
34
 
35
35
  [project.optional-dependencies]
36
- # Document ingestion (rac ingest). Optional so the core install stays light.
37
- # markitdown[docx] pulls the DOCX reader; later formats add their own extras.
38
- ingest = ["markitdown[docx]"]
39
- dev = ["pytest>=7.0", "markitdown[docx]", "python-docx"]
36
+ # Document ingestion (rac ingest). Optional so the core install stays light, and
37
+ # split by format so users only pull the readers they need. Markdown needs no
38
+ # extra (pass-through); HTML needs only base MarkItDown, which any extra below pulls in.
39
+ ingest = ["markitdown[docx]"] # DOCX (+ HTML, Markdown)
40
+ ingest-pdf = ["markitdown[pdf]"] # + PDF
41
+ ingest-office = ["markitdown[pptx,xlsx,xls]"] # + PPTX / XLSX / XLS
42
+ ingest-all = ["markitdown[docx,pdf,pptx,xlsx,xls]"] # every supported format
43
+ # dev also pulls the libraries used to *generate* fixture files in the tests.
44
+ dev = [
45
+ "pytest>=7.0",
46
+ "markitdown[docx,pdf,pptx,xlsx,xls]",
47
+ "python-docx",
48
+ "python-pptx",
49
+ "openpyxl",
50
+ "reportlab",
51
+ ]
40
52
 
41
53
  [project.scripts]
42
54
  rac = "rac.cli:main"
@@ -173,7 +173,7 @@ def build_parser() -> argparse.ArgumentParser:
173
173
 
174
174
  p_ingest = sub.add_parser(
175
175
  "ingest",
176
- help="Convert a source document (DOCX, Markdown) to Markdown.",
176
+ help="Convert a document (DOCX, PDF, HTML, PPTX, XLSX, Markdown) to Markdown.",
177
177
  parents=[version_parent],
178
178
  )
179
179
  p_ingest.add_argument("file", help="Path to the source document.")
@@ -67,23 +67,62 @@ class MarkItDownConverter:
67
67
  """
68
68
 
69
69
  name = "markitdown"
70
- extensions = (".docx",) # v0.3.x will extend: .html, .pdf, ...
70
+ # HTML needs no extra (built into MarkItDown); the others come from the
71
+ # corresponding markitdown extras, exposed as our granular ingest extras.
72
+ extensions = (".docx", ".pdf", ".html", ".htm", ".pptx", ".xls", ".xlsx")
71
73
 
72
74
  def convert(self, path: Path) -> str:
73
75
  try:
74
76
  from markitdown import MarkItDown
75
77
  except ModuleNotFoundError as exc:
76
- raise UnsupportedDocument(
77
- f"converting '{path.suffix}' needs the ingest extra: "
78
- "pip install 'requirements-as-code[ingest]'"
79
- ) from exc
78
+ raise UnsupportedDocument(_missing_extra_message(path.suffix)) from exc
79
+
80
80
  try:
81
81
  result = MarkItDown().convert(str(path))
82
82
  except Exception as exc: # MarkItDown raises a variety of errors
83
+ if _is_missing_dependency(exc):
84
+ # MarkItDown is installed but this format's reader extra isn't.
85
+ raise UnsupportedDocument(_missing_extra_message(path.suffix)) from exc
83
86
  raise ConversionError(f"could not convert {path.name}: {exc}") from exc
84
87
  return result.text_content
85
88
 
86
89
 
90
+ # Which optional extra provides the reader for a given file type. HTML/HTM need
91
+ # no extra (built into MarkItDown), so they fall back to the base `ingest`.
92
+ _EXTRA_FOR_SUFFIX = {
93
+ ".docx": "ingest",
94
+ ".pdf": "ingest-pdf",
95
+ ".pptx": "ingest-office",
96
+ ".xls": "ingest-office",
97
+ ".xlsx": "ingest-office",
98
+ }
99
+
100
+
101
+ def _missing_extra_message(suffix: str) -> str:
102
+ extra = _EXTRA_FOR_SUFFIX.get(suffix.lower(), "ingest")
103
+ return (
104
+ f"converting '{suffix}' needs the {extra} extra: "
105
+ f"pip install 'requirements-as-code[{extra}]'"
106
+ )
107
+
108
+
109
+ def _is_missing_dependency(exc: Exception) -> bool:
110
+ """True if ``exc`` is (or wraps) a MarkItDown missing-dependency error."""
111
+ try:
112
+ from markitdown._exceptions import MissingDependencyException
113
+ except Exception: # pragma: no cover - defensive
114
+ return False
115
+ if isinstance(exc, MissingDependencyException):
116
+ return True
117
+ # MarkItDown wraps converter failures in FileConversionException(attempts=...),
118
+ # each attempt carrying the original error in .exc_info = (type, value, tb).
119
+ for attempt in getattr(exc, "attempts", None) or []:
120
+ info = getattr(attempt, "exc_info", None)
121
+ if info and isinstance(info[1], MissingDependencyException):
122
+ return True
123
+ return False
124
+
125
+
87
126
  # Registry — first converter whose extensions match wins. Order is not currently
88
127
  # significant since extension sets are disjoint, but kept explicit for clarity.
89
128
  _CONVERTERS: list[DocumentConverter] = [MarkdownConverter(), MarkItDownConverter()]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: requirements-as-code
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: RAC — lint and diff product requirements written in Markdown.
5
5
  Author: tcballard
6
6
  License-Expression: MIT
@@ -23,10 +23,19 @@ License-File: LICENSE
23
23
  Requires-Dist: markdown-it-py>=3.0
24
24
  Provides-Extra: ingest
25
25
  Requires-Dist: markitdown[docx]; extra == "ingest"
26
+ Provides-Extra: ingest-pdf
27
+ Requires-Dist: markitdown[pdf]; extra == "ingest-pdf"
28
+ Provides-Extra: ingest-office
29
+ Requires-Dist: markitdown[pptx,xls,xlsx]; extra == "ingest-office"
30
+ Provides-Extra: ingest-all
31
+ Requires-Dist: markitdown[docx,pdf,pptx,xls,xlsx]; extra == "ingest-all"
26
32
  Provides-Extra: dev
27
33
  Requires-Dist: pytest>=7.0; extra == "dev"
28
- Requires-Dist: markitdown[docx]; extra == "dev"
34
+ Requires-Dist: markitdown[docx,pdf,pptx,xls,xlsx]; extra == "dev"
29
35
  Requires-Dist: python-docx; extra == "dev"
36
+ Requires-Dist: python-pptx; extra == "dev"
37
+ Requires-Dist: openpyxl; extra == "dev"
38
+ Requires-Dist: reportlab; extra == "dev"
30
39
  Dynamic: license-file
31
40
 
32
41
  # RAC (Requirements-as-Code)
@@ -436,15 +445,25 @@ rac ingest spec.docx --json # { source, converter, output, markdown }
436
445
  ```
437
446
 
438
447
  Conversion is powered by [MarkItDown](https://github.com/microsoft/markitdown),
439
- installed via the optional `ingest` extra:
448
+ installed via optional extras — split by format so you only pull the readers you
449
+ need:
450
+
451
+ | Extra | Adds | Formats |
452
+ |-------|------|---------|
453
+ | `ingest` | `markitdown[docx]` | DOCX, HTML, Markdown |
454
+ | `ingest-pdf` | `markitdown[pdf]` | + PDF |
455
+ | `ingest-office` | `markitdown[pptx,xlsx,xls]` | + PPTX, XLSX, XLS |
456
+ | `ingest-all` | everything above | all supported formats |
440
457
 
441
458
  ```bash
442
- pip install "requirements-as-code[ingest]"
459
+ pip install "requirements-as-code[ingest]" # DOCX + HTML + Markdown
460
+ pip install "requirements-as-code[ingest-all]" # everything
443
461
  ```
444
462
 
445
- Supported today: **DOCX** and **Markdown** (pass-through). HTML and PDF are
446
- planned for v0.3.x. Converters live behind a `DocumentConverter` abstraction, so
447
- new sources can be added without changing the CLI.
463
+ Markdown needs no extra (it is a pass-through); HTML needs only MarkItDown itself
464
+ (any `ingest*` extra). If a file's reader isn't installed, `rac ingest` tells you exactly
465
+ which extra to install. Converters live behind a `DocumentConverter` abstraction,
466
+ so new sources can be added without changing the CLI.
448
467
 
449
468
  `ingest` exits `0` on success, `1` if a recognized document fails to convert, and
450
469
  `2` for usage errors (file not found, unsupported type, missing `ingest` extra,
@@ -23,6 +23,7 @@ planning/future/v1.4-claude-skills.md
23
23
  planning/future/v1.4-python-sdk.md
24
24
  planning/roadmap/v0.2-stats.md
25
25
  planning/roadmap/v0.3-ingest.md
26
+ planning/roadmap/v0.3.1-formats.md
26
27
  planning/roadmap/v0.4-inspect.md
27
28
  planning/roadmap/v0.5-decisions.md
28
29
  planning/roadmap/v0.6-roadmaps.md
@@ -0,0 +1,21 @@
1
+ markdown-it-py>=3.0
2
+
3
+ [dev]
4
+ pytest>=7.0
5
+ markitdown[docx,pdf,pptx,xls,xlsx]
6
+ python-docx
7
+ python-pptx
8
+ openpyxl
9
+ reportlab
10
+
11
+ [ingest]
12
+ markitdown[docx]
13
+
14
+ [ingest-all]
15
+ markitdown[docx,pdf,pptx,xls,xlsx]
16
+
17
+ [ingest-office]
18
+ markitdown[pptx,xls,xlsx]
19
+
20
+ [ingest-pdf]
21
+ markitdown[pdf]
@@ -26,13 +26,14 @@ from conftest import fixture_path
26
26
  def test_converter_selection_by_extension():
27
27
  assert isinstance(converter_for(Path("a.md")), MarkdownConverter)
28
28
  assert isinstance(converter_for(Path("a.markdown")), MarkdownConverter)
29
- assert isinstance(converter_for(Path("a.docx")), MarkItDownConverter)
29
+ for ext in (".docx", ".pdf", ".html", ".htm", ".pptx", ".xls", ".xlsx"):
30
+ assert isinstance(converter_for(Path(f"a{ext}")), MarkItDownConverter), ext
30
31
  assert converter_for(Path("a.txt")) is None
31
32
 
32
33
 
33
34
  def test_supported_extensions():
34
- assert ".docx" in supported_extensions()
35
- assert ".md" in supported_extensions()
35
+ exts = set(supported_extensions())
36
+ assert {".md", ".docx", ".pdf", ".html", ".htm", ".pptx", ".xls", ".xlsx"} <= exts
36
37
 
37
38
 
38
39
  def test_markdown_passthrough():
@@ -48,6 +49,16 @@ def test_unsupported_type_raises(tmp_path):
48
49
  ingest(str(bad))
49
50
 
50
51
 
52
+ def test_missing_extra_message_points_at_right_extra():
53
+ from rac.ingest import _missing_extra_message
54
+
55
+ assert "[ingest-pdf]" in _missing_extra_message(".pdf")
56
+ assert "[ingest-office]" in _missing_extra_message(".pptx")
57
+ assert "[ingest-office]" in _missing_extra_message(".xlsx")
58
+ assert "[ingest]" in _missing_extra_message(".docx")
59
+ assert "[ingest]" in _missing_extra_message(".html") # html -> base markitdown
60
+
61
+
51
62
  # --- CLI: markdown / error paths (no optional deps needed) ------------------
52
63
 
53
64
 
@@ -127,3 +138,75 @@ def test_cli_ingest_docx_to_file(tmp_path):
127
138
  out = tmp_path / "spec.md"
128
139
  assert main(["ingest", str(src), "-o", str(out)]) == 0
129
140
  assert "# Bond Dashboard" in out.read_text()
141
+
142
+
143
+ # --- Other formats added in v0.3.1 ------------------------------------------
144
+
145
+
146
+ def test_html_conversion(tmp_path):
147
+ # HTML needs no extra dependency (built into MarkItDown).
148
+ src = tmp_path / "page.html"
149
+ src.write_text("<h1>Bond Dashboard</h1><p>Rate exposure.</p>")
150
+ result = ingest(str(src))
151
+ assert result.converter == "markitdown"
152
+ assert "# Bond Dashboard" in result.markdown
153
+
154
+
155
+ def test_pptx_conversion(tmp_path):
156
+ pptx = pytest.importorskip("pptx")
157
+ src = tmp_path / "deck.pptx"
158
+ prs = pptx.Presentation()
159
+ slide = prs.slides.add_slide(prs.slide_layouts[5])
160
+ slide.shapes.title.text = "Deck Title"
161
+ prs.save(str(src))
162
+ result = ingest(str(src))
163
+ assert result.converter == "markitdown"
164
+ assert "Deck Title" in result.markdown
165
+
166
+
167
+ def test_xlsx_conversion(tmp_path):
168
+ openpyxl = pytest.importorskip("openpyxl")
169
+ src = tmp_path / "data.xlsx"
170
+ wb = openpyxl.Workbook()
171
+ ws = wb.active
172
+ ws["A1"] = "Metric"
173
+ ws["A2"] = "MAU"
174
+ wb.save(str(src))
175
+ result = ingest(str(src))
176
+ assert result.converter == "markitdown"
177
+ assert "Metric" in result.markdown
178
+
179
+
180
+ def test_pdf_conversion(tmp_path):
181
+ pytest.importorskip("reportlab")
182
+ from reportlab.pdfgen import canvas
183
+
184
+ src = tmp_path / "doc.pdf"
185
+ c = canvas.Canvas(str(src))
186
+ c.drawString(72, 720, "PDF heading and body text.")
187
+ c.save()
188
+ result = ingest(str(src))
189
+ assert result.converter == "markitdown"
190
+ assert "PDF heading and body text." in result.markdown
191
+
192
+
193
+ def test_missing_dependency_detection():
194
+ # A per-format missing reader should be recognized so the CLI can map it to
195
+ # a clear "install the ingest extra" message (exit 2), direct or wrapped in
196
+ # MarkItDown's FileConversionException(attempts=...).
197
+ from types import SimpleNamespace
198
+
199
+ from markitdown._exceptions import (
200
+ FileConversionException,
201
+ MissingDependencyException,
202
+ )
203
+
204
+ from rac.ingest import _is_missing_dependency
205
+
206
+ assert _is_missing_dependency(MissingDependencyException("x")) is True
207
+ attempt = SimpleNamespace(
208
+ converter=object(),
209
+ exc_info=(MissingDependencyException, MissingDependencyException("y"), None),
210
+ )
211
+ assert _is_missing_dependency(FileConversionException(attempts=[attempt])) is True
212
+ assert _is_missing_dependency(ValueError("unrelated")) is False
@@ -1,9 +0,0 @@
1
- markdown-it-py>=3.0
2
-
3
- [dev]
4
- pytest>=7.0
5
- markitdown[docx]
6
- python-docx
7
-
8
- [ingest]
9
- markitdown[docx]