ingestforge 0.4.0a6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. ingestforge-0.4.0a6/.github/dependabot.yml +10 -0
  2. ingestforge-0.4.0a6/.github/workflows/ci.yml +55 -0
  3. ingestforge-0.4.0a6/.github/workflows/publish.yml +41 -0
  4. ingestforge-0.4.0a6/CHANGELOG.md +45 -0
  5. ingestforge-0.4.0a6/CITATION.cff +24 -0
  6. ingestforge-0.4.0a6/CODE_OF_CONDUCT.md +3 -0
  7. ingestforge-0.4.0a6/CONTRIBUTING.md +7 -0
  8. ingestforge-0.4.0a6/LICENSE +21 -0
  9. ingestforge-0.4.0a6/MANIFEST.in +8 -0
  10. ingestforge-0.4.0a6/PKG-INFO +536 -0
  11. ingestforge-0.4.0a6/README.md +461 -0
  12. ingestforge-0.4.0a6/SECURITY.md +11 -0
  13. ingestforge-0.4.0a6/docs/EXTRACTION_BACKENDS.md +40 -0
  14. ingestforge-0.4.0a6/docs/GENERIC_REST_DESTINATION_CONTRACT.md +25 -0
  15. ingestforge-0.4.0a6/docs/GITHUB_CI_MATRIX.md +19 -0
  16. ingestforge-0.4.0a6/docs/PACKAGING_AND_RELEASE.md +3 -0
  17. ingestforge-0.4.0a6/docs/PRODUCTION_DEPLOYMENT_CHECKLIST.md +30 -0
  18. ingestforge-0.4.0a6/docs/PROVIDER_CONTRACT_MATRIX.md +11 -0
  19. ingestforge-0.4.0a6/docs/PYTHON_3_11_3_14_SUPPORT.md +34 -0
  20. ingestforge-0.4.0a6/docs/RAG_DATASET_CONTRACT.md +3 -0
  21. ingestforge-0.4.0a6/docs/RELEASE_PROVIDER_CHECKLIST.md +13 -0
  22. ingestforge-0.4.0a6/docs/SECURITY_MODEL.md +9 -0
  23. ingestforge-0.4.0a6/docs/release/RELEASE_NOTES_v0_4_0a4.md +13 -0
  24. ingestforge-0.4.0a6/docs/release/RELEASE_NOTES_v0_4_0a5.md +15 -0
  25. ingestforge-0.4.0a6/examples/local_file_ingestion.md +3 -0
  26. ingestforge-0.4.0a6/examples/minimal_python_api.py +5 -0
  27. ingestforge-0.4.0a6/examples/python_api_basic.py +10 -0
  28. ingestforge-0.4.0a6/pyproject.toml +79 -0
  29. ingestforge-0.4.0a6/pytest.ini +2 -0
  30. ingestforge-0.4.0a6/scripts/clean_release_artifacts.py +39 -0
  31. ingestforge-0.4.0a6/scripts/release_hygiene_check.py +61 -0
  32. ingestforge-0.4.0a6/setup.cfg +4 -0
  33. ingestforge-0.4.0a6/src/ingestforge/__init__.py +67 -0
  34. ingestforge-0.4.0a6/src/ingestforge/__main__.py +4 -0
  35. ingestforge-0.4.0a6/src/ingestforge/cli.py +200 -0
  36. ingestforge-0.4.0a6/src/ingestforge/core/config.py +464 -0
  37. ingestforge-0.4.0a6/src/ingestforge/core/contracts.py +311 -0
  38. ingestforge-0.4.0a6/src/ingestforge/core/errors.py +30 -0
  39. ingestforge-0.4.0a6/src/ingestforge/core/languages.py +55 -0
  40. ingestforge-0.4.0a6/src/ingestforge/core/package.py +16 -0
  41. ingestforge-0.4.0a6/src/ingestforge/core/pipeline.py +359 -0
  42. ingestforge-0.4.0a6/src/ingestforge/core/prompts.py +106 -0
  43. ingestforge-0.4.0a6/src/ingestforge/core/provider_doctor.py +205 -0
  44. ingestforge-0.4.0a6/src/ingestforge/core/registry.py +33 -0
  45. ingestforge-0.4.0a6/src/ingestforge/core/validation.py +119 -0
  46. ingestforge-0.4.0a6/src/ingestforge/datasets/__init__.py +0 -0
  47. ingestforge-0.4.0a6/src/ingestforge/datasets/chunker.py +158 -0
  48. ingestforge-0.4.0a6/src/ingestforge/datasets/data_card.py +71 -0
  49. ingestforge-0.4.0a6/src/ingestforge/datasets/rag_export.py +28 -0
  50. ingestforge-0.4.0a6/src/ingestforge/datasets/tokenizers.py +150 -0
  51. ingestforge-0.4.0a6/src/ingestforge/datasets/writer.py +97 -0
  52. ingestforge-0.4.0a6/src/ingestforge/observability/__init__.py +0 -0
  53. ingestforge-0.4.0a6/src/ingestforge/observability/audit_log.py +10 -0
  54. ingestforge-0.4.0a6/src/ingestforge/observability/provenance.py +57 -0
  55. ingestforge-0.4.0a6/src/ingestforge/observability/run_manifest.py +13 -0
  56. ingestforge-0.4.0a6/src/ingestforge/profiles/__init__.py +0 -0
  57. ingestforge-0.4.0a6/src/ingestforge/profiles/dataset_only.yaml +11 -0
  58. ingestforge-0.4.0a6/src/ingestforge/profiles/destination_example.yaml +42 -0
  59. ingestforge-0.4.0a6/src/ingestforge/profiles/examples/deepseek_live.yaml +10 -0
  60. ingestforge-0.4.0a6/src/ingestforge/profiles/examples/gemini_live.yaml +9 -0
  61. ingestforge-0.4.0a6/src/ingestforge/profiles/examples/openai_live.yaml +8 -0
  62. ingestforge-0.4.0a6/src/ingestforge/profiles/examples/strict_live_template.yaml +14 -0
  63. ingestforge-0.4.0a6/src/ingestforge/profiles/manual_safe.yaml +70 -0
  64. ingestforge-0.4.0a6/src/ingestforge/profiles/strict_industrial.yaml +75 -0
  65. ingestforge-0.4.0a6/src/ingestforge/prompts/__init__.py +0 -0
  66. ingestforge-0.4.0a6/src/ingestforge/prompts/article_builder.j2 +4 -0
  67. ingestforge-0.4.0a6/src/ingestforge/prompts/image_ranker.j2 +1 -0
  68. ingestforge-0.4.0a6/src/ingestforge/prompts/reflection_gate.j2 +1 -0
  69. ingestforge-0.4.0a6/src/ingestforge/prompts/translation_qa.j2 +1 -0
  70. ingestforge-0.4.0a6/src/ingestforge/provider_contracts.yaml +29 -0
  71. ingestforge-0.4.0a6/src/ingestforge/providers/ai/__init__.py +12 -0
  72. ingestforge-0.4.0a6/src/ingestforge/providers/ai/base.py +47 -0
  73. ingestforge-0.4.0a6/src/ingestforge/providers/ai/deepseek_provider.py +65 -0
  74. ingestforge-0.4.0a6/src/ingestforge/providers/ai/gemini_provider.py +88 -0
  75. ingestforge-0.4.0a6/src/ingestforge/providers/ai/mock_provider.py +47 -0
  76. ingestforge-0.4.0a6/src/ingestforge/providers/ai/openai_provider.py +71 -0
  77. ingestforge-0.4.0a6/src/ingestforge/providers/ai/schema_repair.py +119 -0
  78. ingestforge-0.4.0a6/src/ingestforge/providers/destination/__init__.py +6 -0
  79. ingestforge-0.4.0a6/src/ingestforge/providers/destination/base.py +11 -0
  80. ingestforge-0.4.0a6/src/ingestforge/providers/destination/generic_rest.py +318 -0
  81. ingestforge-0.4.0a6/src/ingestforge/providers/destination/local_export.py +15 -0
  82. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/__init__.py +0 -0
  83. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/encoding.py +64 -0
  84. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/fetcher.py +101 -0
  85. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/html_extractor.py +138 -0
  86. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/playwright_renderer.py +5 -0
  87. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/robots.py +75 -0
  88. ingestforge-0.4.0a6/src/ingestforge/providers/fetch/safe_url.py +78 -0
  89. ingestforge-0.4.0a6/src/ingestforge/providers/media/__init__.py +0 -0
  90. ingestforge-0.4.0a6/src/ingestforge/providers/media/downloader.py +121 -0
  91. ingestforge-0.4.0a6/src/ingestforge/providers/media/image_candidates.py +31 -0
  92. ingestforge-0.4.0a6/src/ingestforge/providers/media/image_hash.py +29 -0
  93. ingestforge-0.4.0a6/src/ingestforge/providers/media/ocr.py +50 -0
  94. ingestforge-0.4.0a6/src/ingestforge/providers/media/vision_ranker.py +48 -0
  95. ingestforge-0.4.0a6/src/ingestforge/providers/search/__init__.py +12 -0
  96. ingestforge-0.4.0a6/src/ingestforge/providers/search/base.py +44 -0
  97. ingestforge-0.4.0a6/src/ingestforge/providers/search/brave_provider.py +69 -0
  98. ingestforge-0.4.0a6/src/ingestforge/providers/search/firecrawl_provider.py +10 -0
  99. ingestforge-0.4.0a6/src/ingestforge/providers/search/google_cse_provider.py +12 -0
  100. ingestforge-0.4.0a6/src/ingestforge/providers/search/manual_provider.py +38 -0
  101. ingestforge-0.4.0a6/src/ingestforge/providers/search/tavily_provider.py +10 -0
  102. ingestforge-0.4.0a6/src/ingestforge/py.typed +0 -0
  103. ingestforge-0.4.0a6/src/ingestforge/security/__init__.py +0 -0
  104. ingestforge-0.4.0a6/src/ingestforge/security/content_policy.py +48 -0
  105. ingestforge-0.4.0a6/src/ingestforge/security/redaction.py +27 -0
  106. ingestforge-0.4.0a6/src/ingestforge/security/secrets.py +7 -0
  107. ingestforge-0.4.0a6/src/ingestforge.egg-info/PKG-INFO +536 -0
  108. ingestforge-0.4.0a6/src/ingestforge.egg-info/SOURCES.txt +127 -0
  109. ingestforge-0.4.0a6/src/ingestforge.egg-info/dependency_links.txt +1 -0
  110. ingestforge-0.4.0a6/src/ingestforge.egg-info/entry_points.txt +2 -0
  111. ingestforge-0.4.0a6/src/ingestforge.egg-info/requires.txt +61 -0
  112. ingestforge-0.4.0a6/src/ingestforge.egg-info/top_level.txt +1 -0
  113. ingestforge-0.4.0a6/tests/test_a4_hardening.py +207 -0
  114. ingestforge-0.4.0a6/tests/test_a5_hardening.py +129 -0
  115. ingestforge-0.4.0a6/tests/test_a6_doctor_and_contract_matrix.py +175 -0
  116. ingestforge-0.4.0a6/tests/test_a6_language_config_and_template_hardening.py +105 -0
  117. ingestforge-0.4.0a6/tests/test_a6_model_policy.py +87 -0
  118. ingestforge-0.4.0a6/tests/test_a6_prompt_registry_and_cli_polish.py +90 -0
  119. ingestforge-0.4.0a6/tests/test_a6_token_span_chunking.py +48 -0
  120. ingestforge-0.4.0a6/tests/test_config.py +18 -0
  121. ingestforge-0.4.0a6/tests/test_contracts.py +29 -0
  122. ingestforge-0.4.0a6/tests/test_data_card_and_provenance.py +18 -0
  123. ingestforge-0.4.0a6/tests/test_destination.py +102 -0
  124. ingestforge-0.4.0a6/tests/test_extraction_dataset.py +80 -0
  125. ingestforge-0.4.0a6/tests/test_gemini_current_payload.py +49 -0
  126. ingestforge-0.4.0a6/tests/test_pipeline_cli.py +40 -0
  127. ingestforge-0.4.0a6/tests/test_provider_contracts.py +51 -0
  128. ingestforge-0.4.0a6/tests/test_public_api_and_release_metadata.py +56 -0
  129. ingestforge-0.4.0a6/tests/test_safe_url.py +31 -0
@@ -0,0 +1,10 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "github-actions"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "weekly"
7
+ - package-ecosystem: "pip"
8
+ directory: "/"
9
+ schedule:
10
+ interval: "weekly"
@@ -0,0 +1,55 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ defaults:
8
+ run:
9
+ shell: bash
10
+
11
+ jobs:
12
+ test:
13
+ name: py${{ matrix.python-version }} / ${{ matrix.os }}
14
+ runs-on: ${{ matrix.os }}
15
+ strategy:
16
+ fail-fast: false
17
+ matrix:
18
+ os: [ubuntu-latest, windows-latest, macos-latest]
19
+ python-version: ["3.11", "3.12", "3.13", "3.14"]
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+ cache: pip
26
+ - run: python -m pip install --upgrade pip
27
+ - run: python -m pip install -e ".[dev]"
28
+ - run: python -m ruff check .
29
+ - run: python -m ruff format --check .
30
+ - run: python -m mypy src/ingestforge
31
+ - run: python -m compileall -q src tests
32
+ - run: python -m pytest -q
33
+ - run: ingestforge --help
34
+ - run: ingestforge version
35
+
36
+ package:
37
+ name: package hygiene
38
+ runs-on: ubuntu-latest
39
+ steps:
40
+ - uses: actions/checkout@v4
41
+ - uses: actions/setup-python@v5
42
+ with:
43
+ python-version: "3.14"
44
+ cache: pip
45
+ - run: python -m pip install --upgrade pip
46
+ - run: python -m pip install -e ".[dev]"
47
+ - run: python -m mypy src/ingestforge
48
+ - run: python scripts/clean_release_artifacts.py
49
+ - run: python scripts/release_hygiene_check.py
50
+ - run: python -m pip_audit --skip-editable
51
+ - run: python -m build
52
+ - run: python -m twine check dist/*
53
+ - run: python -m pip install --force-reinstall dist/*.whl
54
+ - run: python -c "import ingestforge; print(ingestforge.__version__)"
55
+ - run: ingestforge --help
@@ -0,0 +1,41 @@
1
+ name: Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - uses: actions/setup-python@v5
13
+ with:
14
+ python-version: "3.13"
15
+ cache: pip
16
+ - run: python -m pip install --upgrade pip build twine
17
+ - run: python -m build --sdist --wheel
18
+ - run: python -m twine check dist/*
19
+ - uses: actions/upload-artifact@v4
20
+ with:
21
+ name: dist
22
+ path: dist/*
23
+
24
+ publish:
25
+ needs: build
26
+ runs-on: ubuntu-latest
27
+ environment:
28
+ name: pypi
29
+ url: https://pypi.org/p/ingestforge
30
+ permissions:
31
+ id-token: write
32
+ contents: read
33
+ steps:
34
+ - uses: actions/download-artifact@v4
35
+ with:
36
+ name: dist
37
+ path: dist
38
+ - uses: pypa/gh-action-pypi-publish@release/v1
39
+ with:
40
+ # For TestPyPI, configure repository-url in a separate workflow/environment.
41
+ skip-existing: true
@@ -0,0 +1,45 @@
1
+ # Changelog
2
+
3
+ ## 0.4.0a6
4
+
5
+ - Added optional Trafilatura-backed HTML extraction with `extraction.backend: auto|internal|trafilatura`.
6
+ - Added `ingestforge[extraction]` optional dependency extra.
7
+ - Recorded extraction backend diagnostics and source `extraction_method` in pipeline output.
8
+ - Documented that the alpha claim gate remains exact-match based, not semantic verification.
9
+ - Added config-driven `ai.source_language` and `ai.target_languages` with structural BCP 47-style validation and no provider-language allowlist.
10
+ - Hardened `strict_live_template.yaml` by rejecting empty domain values such as unset `${env:INGESTFORGE_ALLOWED_DOMAIN}`.
11
+ - Removed hard-coded live model defaults from the strict profile; offline defaults use mock/mock.
12
+ - Added generic provider model policy: model IDs are opaque strings and live providers require explicit non-empty models when external AI calls are enabled.
13
+ - Added provider doctor diagnostics for local payload contract validation.
14
+ - Added current/legacy Gemini structured-output payload styles behind explicit config.
15
+ - Made DeepSeek thinking control explicit and added OpenAI-SDK-compatible extra_body helper.
16
+ - Fixed approximate token chunking to slice original text spans for multilingual source preservation.
17
+ - Added provider contract matrix and release provider checklist.
18
+ - Documented SSRF validate-mode DNS-rebinding/TOCTOU limitations honestly.
19
+ - Added a real packaged prompt registry so `ai.prompt_version` resolves to `src/ingestforge/prompts/*.j2` at runtime and unknown prompt versions fail clearly.
20
+ - Polished CLI boolean flags so release help does not expose confusing `--no-no-*` or `--no-offline` aliases.
21
+ - Polished release metadata for PyPI and Zenodo: aligned `CITATION.cff` license with MIT, added ORCID/abstract/keywords, made README license/citation links absolute, and added a PyPI environment to Trusted Publishing.
22
+
23
+ ## 0.4.0a5
24
+
25
+ - Hardened SSRF strict-allowlist semantics and documentation.
26
+ - Added robust HTML byte decoding metadata for multilingual pages.
27
+ - Replaced misleading token/word chunking with explicit tokenizer abstraction.
28
+ - Fixed package hash ordering so dataset records are included before hashing.
29
+ - Made Python API write semantics explicit through `write_dataset`.
30
+ - Added conservative claim-gate statuses and content-policy extension points.
31
+ - Added `.gitignore` and updated release hygiene expectations.
32
+
33
+ ## 0.4.0a4
34
+
35
+ - Support target narrowed and documented as Python 3.11 through 3.14.
36
+ - GitHub CI matrix expanded to Linux, Windows, and macOS across Python 3.11, 3.12, 3.13, and 3.14.
37
+ - Added short public API helpers: `ingestforge.pipeline()` and `ingestforge.ingest_url()`.
38
+ - Updated packaging metadata, Ruff target, MyPy target, docs, examples, release checks, and hygiene rules for public alpha distribution.
39
+ - Removed generated caches and old build artifacts from source delivery.
40
+
41
+ ## 0.4.0a1
42
+
43
+ - First clean public alpha under the IngestForge name.
44
+ - Brand-neutral package/import/CLI layout.
45
+ - Standard package contract, provenance ledger, evidence gate, reflection gate, data card generation, RAG export, and generic REST destination adapter.
@@ -0,0 +1,24 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use IngestForge, please cite it."
3
+ title: "IngestForge: Config-driven AI content ingestion and provenance-aware RAG dataset building"
4
+ authors:
5
+ - family-names: "Jamei"
6
+ given-names: "Parvaz"
7
+ orcid: "https://orcid.org/0009-0002-9980-270X"
8
+ abstract: "IngestForge is a lightweight, config-driven Python library for safe web ingestion, optional Trafilatura-backed extraction, evidence-gated AI article generation, provenance ledgers, RAG dataset chunks, data cards, and configurable multi-provider exports."
9
+ keywords:
10
+ - "AI content ingestion"
11
+ - "RAG"
12
+ - "provenance"
13
+ - "web extraction"
14
+ - "structured output"
15
+ - "audit trail"
16
+ - "dataset generation"
17
+ - "OpenAI"
18
+ - "Gemini"
19
+ - "DeepSeek"
20
+ version: "0.4.0a6"
21
+ date-released: "2026-05-23"
22
+ license: "MIT"
23
+ repository-code: "https://github.com/Parvaz-Jamei/IngestForge"
24
+ type: software
@@ -0,0 +1,3 @@
1
+ # Code of Conduct
2
+
3
+ Be respectful, evidence-focused, and constructive. This project is intended for safe and auditable ingestion workflows.
@@ -0,0 +1,7 @@
1
+ # Contributing
2
+
3
+ 1. Create a virtual environment.
4
+ 2. Install with `pip install -e ".[dev]"` (the quotes keep shells such as zsh from treating the brackets as a glob pattern).
5
+ 3. Run `ruff check .`, `ruff format --check .`, `pytest -q`, and `python -m compileall -q src tests`.
6
+ 4. Keep public code brand-neutral and config-driven.
7
+ 5. Do not commit secrets or generated run artifacts.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Parvaz Jamei
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ include README.md LICENSE CHANGELOG.md SECURITY.md CONTRIBUTING.md CODE_OF_CONDUCT.md CITATION.cff pyproject.toml pytest.ini
2
+ recursive-include src/ingestforge/prompts *.j2
3
+ recursive-include src/ingestforge/profiles *.yaml
4
+ include src/ingestforge/py.typed
5
+ recursive-include docs *.md
6
+ recursive-include examples *.md *.py
7
+ recursive-include scripts *.py
8
+ recursive-include .github *.yml