ingestforge 0.4.0a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. ingestforge/__init__.py +67 -0
  2. ingestforge/__main__.py +4 -0
  3. ingestforge/cli.py +200 -0
  4. ingestforge/core/config.py +464 -0
  5. ingestforge/core/contracts.py +311 -0
  6. ingestforge/core/errors.py +30 -0
  7. ingestforge/core/languages.py +55 -0
  8. ingestforge/core/package.py +16 -0
  9. ingestforge/core/pipeline.py +359 -0
  10. ingestforge/core/prompts.py +106 -0
  11. ingestforge/core/provider_doctor.py +205 -0
  12. ingestforge/core/registry.py +33 -0
  13. ingestforge/core/validation.py +119 -0
  14. ingestforge/datasets/__init__.py +0 -0
  15. ingestforge/datasets/chunker.py +158 -0
  16. ingestforge/datasets/data_card.py +71 -0
  17. ingestforge/datasets/rag_export.py +28 -0
  18. ingestforge/datasets/tokenizers.py +150 -0
  19. ingestforge/datasets/writer.py +97 -0
  20. ingestforge/observability/__init__.py +0 -0
  21. ingestforge/observability/audit_log.py +10 -0
  22. ingestforge/observability/provenance.py +57 -0
  23. ingestforge/observability/run_manifest.py +13 -0
  24. ingestforge/profiles/__init__.py +0 -0
  25. ingestforge/profiles/dataset_only.yaml +11 -0
  26. ingestforge/profiles/destination_example.yaml +42 -0
  27. ingestforge/profiles/examples/deepseek_live.yaml +10 -0
  28. ingestforge/profiles/examples/gemini_live.yaml +9 -0
  29. ingestforge/profiles/examples/openai_live.yaml +8 -0
  30. ingestforge/profiles/examples/strict_live_template.yaml +14 -0
  31. ingestforge/profiles/manual_safe.yaml +70 -0
  32. ingestforge/profiles/strict_industrial.yaml +75 -0
  33. ingestforge/prompts/__init__.py +0 -0
  34. ingestforge/prompts/article_builder.j2 +4 -0
  35. ingestforge/prompts/image_ranker.j2 +1 -0
  36. ingestforge/prompts/reflection_gate.j2 +1 -0
  37. ingestforge/prompts/translation_qa.j2 +1 -0
  38. ingestforge/provider_contracts.yaml +29 -0
  39. ingestforge/providers/ai/__init__.py +12 -0
  40. ingestforge/providers/ai/base.py +47 -0
  41. ingestforge/providers/ai/deepseek_provider.py +65 -0
  42. ingestforge/providers/ai/gemini_provider.py +88 -0
  43. ingestforge/providers/ai/mock_provider.py +47 -0
  44. ingestforge/providers/ai/openai_provider.py +71 -0
  45. ingestforge/providers/ai/schema_repair.py +119 -0
  46. ingestforge/providers/destination/__init__.py +6 -0
  47. ingestforge/providers/destination/base.py +11 -0
  48. ingestforge/providers/destination/generic_rest.py +318 -0
  49. ingestforge/providers/destination/local_export.py +15 -0
  50. ingestforge/providers/fetch/__init__.py +0 -0
  51. ingestforge/providers/fetch/encoding.py +64 -0
  52. ingestforge/providers/fetch/fetcher.py +101 -0
  53. ingestforge/providers/fetch/html_extractor.py +138 -0
  54. ingestforge/providers/fetch/playwright_renderer.py +5 -0
  55. ingestforge/providers/fetch/robots.py +75 -0
  56. ingestforge/providers/fetch/safe_url.py +78 -0
  57. ingestforge/providers/media/__init__.py +0 -0
  58. ingestforge/providers/media/downloader.py +121 -0
  59. ingestforge/providers/media/image_candidates.py +31 -0
  60. ingestforge/providers/media/image_hash.py +29 -0
  61. ingestforge/providers/media/ocr.py +50 -0
  62. ingestforge/providers/media/vision_ranker.py +48 -0
  63. ingestforge/providers/search/__init__.py +12 -0
  64. ingestforge/providers/search/base.py +44 -0
  65. ingestforge/providers/search/brave_provider.py +69 -0
  66. ingestforge/providers/search/firecrawl_provider.py +10 -0
  67. ingestforge/providers/search/google_cse_provider.py +12 -0
  68. ingestforge/providers/search/manual_provider.py +38 -0
  69. ingestforge/providers/search/tavily_provider.py +10 -0
  70. ingestforge/py.typed +0 -0
  71. ingestforge/security/__init__.py +0 -0
  72. ingestforge/security/content_policy.py +48 -0
  73. ingestforge/security/redaction.py +27 -0
  74. ingestforge/security/secrets.py +7 -0
  75. ingestforge-0.4.0a6.dist-info/METADATA +536 -0
  76. ingestforge-0.4.0a6.dist-info/RECORD +80 -0
  77. ingestforge-0.4.0a6.dist-info/WHEEL +5 -0
  78. ingestforge-0.4.0a6.dist-info/entry_points.txt +2 -0
  79. ingestforge-0.4.0a6.dist-info/licenses/LICENSE +21 -0
  80. ingestforge-0.4.0a6.dist-info/top_level.txt +1 -0
ingestforge/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
def _resolve_version() -> str:
    """Return the installed distribution version, or the pinned fallback.

    The fallback string is used when no ``ingestforge`` distribution
    metadata can be found by :func:`importlib.metadata.version`.
    """
    try:
        return version("ingestforge")
    except PackageNotFoundError:
        return "0.4.0a6"


__version__ = _resolve_version()
11
+
12
+ from ingestforge.core.config import IngestForgeProfile, load_profile
13
+ from ingestforge.core.contracts import StandardPackage
14
+ from ingestforge.core.pipeline import IngestPipeline
15
+
16
+
17
def pipeline(
    profile: str | Path | IngestForgeProfile = "profiles/manual_safe.yaml",
    *,
    overrides: dict[str, Any] | None = None,
) -> IngestPipeline:
    """Create an :class:`IngestPipeline` from a profile path or profile object.

    This is the shortest stable public API for library users who do not need to
    manually instantiate the pipeline class.

    Args:
        profile: Either an already-built :class:`IngestForgeProfile`, or a
            path (``str`` / :class:`~pathlib.Path`) handed to
            ``load_profile``.
        overrides: Forwarded to ``load_profile`` when ``profile`` is a path.
            When ``profile`` is already an :class:`IngestForgeProfile` the
            overrides cannot be applied here; they are ignored and a
            :class:`UserWarning` is emitted so the caller notices.

    Returns:
        The pipeline produced by ``IngestPipeline.from_profile``.
    """
    if isinstance(profile, IngestForgeProfile):
        if overrides:
            # Previously these were dropped without any signal; keep the
            # return value unchanged but make the no-op visible.
            import warnings

            warnings.warn(
                "'overrides' is ignored when 'profile' is already an "
                "IngestForgeProfile instance",
                UserWarning,
                stacklevel=2,
            )
        return IngestPipeline.from_profile(profile)

    return IngestPipeline.from_profile(load_profile(profile, overrides=overrides))
33
+
34
+
35
def ingest_url(
    url: str,
    profile: str | Path | IngestForgeProfile = "profiles/manual_safe.yaml",
    *,
    runs_dir: str | Path = "runs",
    dry_run: bool | None = None,
    write_dataset: bool | None = None,
    external_calls: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> StandardPackage:
    """Ingest a single URL and return the resulting standard package.

    A pipeline is built with :func:`pipeline` from ``profile`` and
    ``overrides``; the remaining keyword arguments are forwarded unchanged to
    that pipeline's ``ingest_url`` method.

    Python callers can explicitly disable writing with ``write_dataset=False``.
    CLI profiles may still write by default.
    """
    runner = pipeline(profile, overrides=overrides)
    return runner.ingest_url(
        url,
        dry_run=dry_run,
        runs_dir=runs_dir,
        write_dataset=write_dataset,
        external_calls=external_calls,
    )
57
+
58
+
59
# Names re-exported as the package's public API.
__all__ = [
    # Core types imported from ingestforge.core.*
    "IngestPipeline",
    "IngestForgeProfile",
    "StandardPackage",
    # Convenience entry points defined or re-exported in this module
    "ingest_url",
    "load_profile",
    "pipeline",
    # Package metadata
    "__version__",
]
ingestforge/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from ingestforge.cli import app
2
+
3
# Run the Typer application only when this module is executed as a script
# (for example via ``python -m ingestforge``), not when it is imported.
if __name__ == "__main__":
    app()
ingestforge/cli.py ADDED
@@ -0,0 +1,200 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from importlib import resources
6
+ from pathlib import Path
7
+
8
+ import typer
9
+
10
+ from ingestforge import __version__
11
+ from ingestforge.core.config import load_profile, validate_profile
12
+ from ingestforge.core.package import read_package
13
+ from ingestforge.core.pipeline import IngestPipeline
14
+ from ingestforge.core.provider_doctor import run_provider_doctor
15
+ from ingestforge.datasets.rag_export import export_jsonl
16
+ from ingestforge.providers.destination.generic_rest import GenericRestDestination
17
+ from ingestforge.providers.destination.local_export import LocalExportDestination
18
+
19
# Root CLI application; the commands below register themselves on it.
app = typer.Typer(
    help="Config-driven AI content ingestion and RAG dataset library.",
)

# Sub-application holding the diagnostics commands.
doctor_app = typer.Typer(
    help="Local and optional live diagnostics for provider configuration.",
)

# Shared parameter defaults, built once at module level and reused by
# several command signatures.
DEFAULT_PROFILE_ARG = typer.Argument(Path("profiles/manual_safe.yaml"))
DEFAULT_PROFILE_OPT = typer.Option(Path("profiles/manual_safe.yaml"))
DESTINATION_PROFILE_OPT = typer.Option(Path("profiles/destination_example.yaml"))
24
+
25
+
26
@app.command()
def init(target: Path = DEFAULT_PROFILE_ARG):
    """Create a starter profile file for a new project."""
    target.parent.mkdir(parents=True, exist_ok=True)

    # Only seed the file when nothing is there yet; an existing path is left
    # untouched and reported as ``"created": false``.
    created = not target.exists()
    if created:
        template = resources.files("ingestforge.profiles") / "manual_safe.yaml"
        target.write_text(template.read_text(encoding="utf-8"), encoding="utf-8")

    summary = {
        "version": __version__,
        "profile": str(target),
        "created": created,
        "next": f"ingestforge ingest-url https://example.com/article --profile {target}",
    }
    typer.echo(json.dumps(summary, indent=2))
49
+
50
+
51
@app.command("validate-profile")
def validate_profile_cmd(profile: Path):
    # Validate the profile at the given path and print the validated model
    # as indented JSON.
    typer.echo(validate_profile(profile).model_dump_json(indent=2))
55
+
56
+
57
@app.command("ingest-url")
def ingest_url(
    url: str,
    profile: Path = DEFAULT_PROFILE_OPT,
    dry_run: bool = True,
    runs_dir: Path = Path("runs"),
    no_external_calls: bool = typer.Option(
        False, "--no-external-calls", help="Disable all external AI/search calls for this run."
    ),
    external_calls: str | None = typer.Option(None, "--external-calls"),
    live: bool = typer.Option(
        False, "--live", help="Enable live external calls according to the profile."
    ),
    no_live: bool = typer.Option(False, "--no-live", help="Force external calls to disabled."),
    write_dataset: bool = True,
):
    # Resolve the external-call mode. Precedence: either "disable" flag wins,
    # then --live, and otherwise the raw --external-calls value (possibly
    # None) is used as given.
    if no_external_calls or no_live:
        mode = "disabled"
    elif live:
        mode = "enabled"
    else:
        mode = external_calls

    # The same settings are applied twice: as profile overrides and as
    # per-call keyword arguments to the pipeline.
    settings: dict[str, object] = {"dry_run": dry_run, "write_dataset": write_dataset}
    if mode is not None:
        settings["external_calls"] = mode

    loaded = load_profile(profile, overrides={"pipeline": settings})
    runner = IngestPipeline.from_profile(loaded)
    package = runner.ingest_url(
        url,
        dry_run=dry_run,
        runs_dir=runs_dir,
        write_dataset=write_dataset,
        external_calls=mode,
    )

    report = {
        "job_id": package.job_id,
        "package": str(runs_dir / package.job_id / "package.json"),
        "valid": package.validation_report.is_valid,
    }
    typer.echo(json.dumps(report, indent=2))
102
+
103
+
104
@app.command("search-topic")
def search_topic(
    query: str,
    profile: Path = DEFAULT_PROFILE_OPT,
    dry_run: bool = True,
    no_external_calls: bool = typer.Option(
        False, "--no-external-calls", help="Disable all external AI/search calls for this run."
    ),
    external_calls: str | None = typer.Option(None, "--external-calls"),
    live: bool = typer.Option(
        False, "--live", help="Enable live external calls according to the profile."
    ),
    no_live: bool = typer.Option(False, "--no-live", help="Force external calls to disabled."),
):
    # Resolve the external-call mode. Precedence: either "disable" flag wins,
    # then --live, and otherwise the raw --external-calls value (possibly
    # None) is used as given.
    if no_external_calls or no_live:
        mode = "disabled"
    elif live:
        mode = "enabled"
    else:
        mode = external_calls

    settings: dict[str, object] = {"dry_run": dry_run}
    if mode is not None:
        settings["external_calls"] = mode

    loaded = load_profile(profile, overrides={"pipeline": settings})
    runner = IngestPipeline.from_profile(loaded)
    urls = runner.search_topic(query, external_calls=mode)
    typer.echo(json.dumps({"urls": urls}, indent=2))
131
+
132
+
133
@app.command("build-dataset")
def build_dataset(run_dir: Path):
    # Load the package stored in the run directory, ask it to write its
    # dataset under the run directory's parent, then echo the run directory.
    manifest_path = run_dir / "package.json"
    read_package(manifest_path).write_dataset(run_dir.parent)
    typer.echo(str(run_dir))
138
+
139
+
140
@app.command("validate-package")
def validate_package(path: Path):
    # Print the validation report as JSON and exit with status 0 when the
    # report is valid, 1 otherwise.
    report = read_package(path).validate_package()
    typer.echo(report.model_dump_json(indent=2))
    if report.is_valid:
        raise typer.Exit(0)
    raise typer.Exit(1)
146
+
147
+
148
@app.command("export-rag")
def export_rag(run_dir: Path):
    # Run the JSONL export for the run directory and echo whatever
    # export_jsonl returns, converted to a string.
    typer.echo(str(export_jsonl(run_dir)))
152
+
153
+
154
@app.command()
def publish(run_dir: Path, profile: Path = DESTINATION_PROFILE_OPT):
    # Publish the package found in run_dir through the destination selected
    # by the profile, then print the publish result as JSON.
    package = read_package(run_dir / "package.json")
    settings = load_profile(profile)

    # "local_export" is handled by the local destination rooted at the run
    # directory's parent; every other provider value goes to the generic
    # REST destination.
    if settings.destination.provider == "local_export":
        destination = LocalExportDestination(run_dir.parent)
    else:
        destination = GenericRestDestination(settings.destination)

    outcome = destination.publish(package)
    typer.echo(json.dumps(outcome, indent=2, ensure_ascii=False))
163
+
164
+
165
@doctor_app.callback(invoke_without_command=True)
def doctor_summary(ctx: typer.Context, profile: Path = DEFAULT_PROFILE_OPT):
    # Print a short environment/profile summary, but only when no doctor
    # subcommand was invoked; with a subcommand this callback does nothing.
    if ctx.invoked_subcommand is None:
        loaded = load_profile(profile)
        summary = {
            "python": sys.version.split()[0],
            "profile": loaded.profile_name,
            "ai_provider": loaded.ai.provider,
            "destination": loaded.destination.provider,
            "dry_run": loaded.pipeline.dry_run,
        }
        typer.echo(json.dumps(summary, indent=2))
178
+
179
+
180
@doctor_app.command("providers")
def doctor_providers(
    profile: Path = DEFAULT_PROFILE_OPT,
    offline: bool = typer.Option(
        False, "--offline", help="Run local payload/schema checks only; this overrides --live."
    ),
    live: bool = typer.Option(
        False, "--live", help="Run optional live smoke when env opt-in and credentials exist."
    ),
):
    # --offline takes precedence: live mode is requested only when --live is
    # set and --offline is not.
    run_live = live and not offline
    loaded = load_profile(profile)
    findings = run_provider_doctor(loaded, live=run_live)
    typer.echo(json.dumps(findings, indent=2, ensure_ascii=False))
193
+
194
+
195
# Mount the diagnostics sub-application on the root app under the name "doctor".
app.add_typer(doctor_app, name="doctor")
196
+
197
+
198
@app.command()
def version():
    # Echo the package version string imported from ``ingestforge``.
    typer.echo(__version__)