ingestforge 0.4.0a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestforge/__init__.py +67 -0
- ingestforge/__main__.py +4 -0
- ingestforge/cli.py +200 -0
- ingestforge/core/config.py +464 -0
- ingestforge/core/contracts.py +311 -0
- ingestforge/core/errors.py +30 -0
- ingestforge/core/languages.py +55 -0
- ingestforge/core/package.py +16 -0
- ingestforge/core/pipeline.py +359 -0
- ingestforge/core/prompts.py +106 -0
- ingestforge/core/provider_doctor.py +205 -0
- ingestforge/core/registry.py +33 -0
- ingestforge/core/validation.py +119 -0
- ingestforge/datasets/__init__.py +0 -0
- ingestforge/datasets/chunker.py +158 -0
- ingestforge/datasets/data_card.py +71 -0
- ingestforge/datasets/rag_export.py +28 -0
- ingestforge/datasets/tokenizers.py +150 -0
- ingestforge/datasets/writer.py +97 -0
- ingestforge/observability/__init__.py +0 -0
- ingestforge/observability/audit_log.py +10 -0
- ingestforge/observability/provenance.py +57 -0
- ingestforge/observability/run_manifest.py +13 -0
- ingestforge/profiles/__init__.py +0 -0
- ingestforge/profiles/dataset_only.yaml +11 -0
- ingestforge/profiles/destination_example.yaml +42 -0
- ingestforge/profiles/examples/deepseek_live.yaml +10 -0
- ingestforge/profiles/examples/gemini_live.yaml +9 -0
- ingestforge/profiles/examples/openai_live.yaml +8 -0
- ingestforge/profiles/examples/strict_live_template.yaml +14 -0
- ingestforge/profiles/manual_safe.yaml +70 -0
- ingestforge/profiles/strict_industrial.yaml +75 -0
- ingestforge/prompts/__init__.py +0 -0
- ingestforge/prompts/article_builder.j2 +4 -0
- ingestforge/prompts/image_ranker.j2 +1 -0
- ingestforge/prompts/reflection_gate.j2 +1 -0
- ingestforge/prompts/translation_qa.j2 +1 -0
- ingestforge/provider_contracts.yaml +29 -0
- ingestforge/providers/ai/__init__.py +12 -0
- ingestforge/providers/ai/base.py +47 -0
- ingestforge/providers/ai/deepseek_provider.py +65 -0
- ingestforge/providers/ai/gemini_provider.py +88 -0
- ingestforge/providers/ai/mock_provider.py +47 -0
- ingestforge/providers/ai/openai_provider.py +71 -0
- ingestforge/providers/ai/schema_repair.py +119 -0
- ingestforge/providers/destination/__init__.py +6 -0
- ingestforge/providers/destination/base.py +11 -0
- ingestforge/providers/destination/generic_rest.py +318 -0
- ingestforge/providers/destination/local_export.py +15 -0
- ingestforge/providers/fetch/__init__.py +0 -0
- ingestforge/providers/fetch/encoding.py +64 -0
- ingestforge/providers/fetch/fetcher.py +101 -0
- ingestforge/providers/fetch/html_extractor.py +138 -0
- ingestforge/providers/fetch/playwright_renderer.py +5 -0
- ingestforge/providers/fetch/robots.py +75 -0
- ingestforge/providers/fetch/safe_url.py +78 -0
- ingestforge/providers/media/__init__.py +0 -0
- ingestforge/providers/media/downloader.py +121 -0
- ingestforge/providers/media/image_candidates.py +31 -0
- ingestforge/providers/media/image_hash.py +29 -0
- ingestforge/providers/media/ocr.py +50 -0
- ingestforge/providers/media/vision_ranker.py +48 -0
- ingestforge/providers/search/__init__.py +12 -0
- ingestforge/providers/search/base.py +44 -0
- ingestforge/providers/search/brave_provider.py +69 -0
- ingestforge/providers/search/firecrawl_provider.py +10 -0
- ingestforge/providers/search/google_cse_provider.py +12 -0
- ingestforge/providers/search/manual_provider.py +38 -0
- ingestforge/providers/search/tavily_provider.py +10 -0
- ingestforge/py.typed +0 -0
- ingestforge/security/__init__.py +0 -0
- ingestforge/security/content_policy.py +48 -0
- ingestforge/security/redaction.py +27 -0
- ingestforge/security/secrets.py +7 -0
- ingestforge-0.4.0a6.dist-info/METADATA +536 -0
- ingestforge-0.4.0a6.dist-info/RECORD +80 -0
- ingestforge-0.4.0a6.dist-info/WHEEL +5 -0
- ingestforge-0.4.0a6.dist-info/entry_points.txt +2 -0
- ingestforge-0.4.0a6.dist-info/licenses/LICENSE +21 -0
- ingestforge-0.4.0a6.dist-info/top_level.txt +1 -0
ingestforge/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# Resolve the package version from the installed distribution metadata.
try:
    __version__ = version("ingestforge")
except PackageNotFoundError:
    # No installed distribution named "ingestforge" was found (for example
    # when running from a plain source checkout), so use the pinned literal.
    __version__ = "0.4.0a6"
|
|
11
|
+
|
|
12
|
+
from ingestforge.core.config import IngestForgeProfile, load_profile
|
|
13
|
+
from ingestforge.core.contracts import StandardPackage
|
|
14
|
+
from ingestforge.core.pipeline import IngestPipeline
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def pipeline(
    profile: str | Path | IngestForgeProfile = "profiles/manual_safe.yaml",
    *,
    overrides: dict[str, Any] | None = None,
) -> IngestPipeline:
    """Build an :class:`IngestPipeline` from a profile path or profile object.

    This is the compact public entry point for library users who do not want
    to instantiate the pipeline class themselves.

    Args:
        profile: Either an already constructed :class:`IngestForgeProfile`,
            or a path (``str`` / :class:`~pathlib.Path`) that is handed to
            :func:`load_profile`.
        overrides: Optional mapping forwarded to :func:`load_profile`. It is
            only used when ``profile`` is a path; a profile object is taken
            as given and ``overrides`` is not applied to it.

    Returns:
        The pipeline produced by :meth:`IngestPipeline.from_profile`.
    """
    # A ready-made profile object is used untouched.
    if isinstance(profile, IngestForgeProfile):
        return IngestPipeline.from_profile(profile)

    # Anything else is treated as a profile location and loaded first.
    return IngestPipeline.from_profile(load_profile(profile, overrides=overrides))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def ingest_url(
    url: str,
    profile: str | Path | IngestForgeProfile = "profiles/manual_safe.yaml",
    *,
    runs_dir: str | Path = "runs",
    dry_run: bool | None = None,
    write_dataset: bool | None = None,
    external_calls: str | None = None,
    overrides: dict[str, Any] | None = None,
) -> StandardPackage:
    """Ingest a single URL and return the resulting standard package.

    The pipeline is created with :func:`pipeline` from ``profile`` and
    ``overrides``; every other keyword argument is passed straight through
    to :meth:`IngestPipeline.ingest_url`.

    Python callers can explicitly turn off writing with
    ``write_dataset=False``. CLI profiles may still write by default.

    Args:
        url: The URL to ingest.
        profile: Profile path or profile object used to build the pipeline.
        runs_dir: Directory forwarded to the pipeline as ``runs_dir``.
        dry_run: Forwarded unchanged; ``None`` is passed through as-is.
        write_dataset: Forwarded unchanged; ``None`` is passed through as-is.
        external_calls: Forwarded unchanged; ``None`` is passed through as-is.
        overrides: Optional profile overrides, used when ``profile`` is a path.

    Returns:
        The package returned by the pipeline's ``ingest_url`` call.
    """
    runner = pipeline(profile, overrides=overrides)
    return runner.ingest_url(
        url,
        dry_run=dry_run,
        runs_dir=runs_dir,
        write_dataset=write_dataset,
        external_calls=external_calls,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Names exported by ``from ingestforge import *``: the pipeline class, the
# profile and package models, the convenience helpers, and the version string.
__all__ = [
    "IngestPipeline",
    "IngestForgeProfile",
    "StandardPackage",
    "ingest_url",
    "load_profile",
    "pipeline",
    "__version__",
]
|
ingestforge/__main__.py
ADDED
ingestforge/cli.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from importlib import resources
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from ingestforge import __version__
|
|
11
|
+
from ingestforge.core.config import load_profile, validate_profile
|
|
12
|
+
from ingestforge.core.package import read_package
|
|
13
|
+
from ingestforge.core.pipeline import IngestPipeline
|
|
14
|
+
from ingestforge.core.provider_doctor import run_provider_doctor
|
|
15
|
+
from ingestforge.datasets.rag_export import export_jsonl
|
|
16
|
+
from ingestforge.providers.destination.generic_rest import GenericRestDestination
|
|
17
|
+
from ingestforge.providers.destination.local_export import LocalExportDestination
|
|
18
|
+
|
|
19
|
+
# Root command-line application.
app = typer.Typer(help="Config-driven AI content ingestion and RAG dataset library.")

# Sub-application that holds the diagnostic commands.
doctor_app = typer.Typer(help="Local and optional live diagnostics for provider configuration.")

# Shared parameter defaults, built once at import time and reused in the
# command signatures below.
DEFAULT_PROFILE_ARG = typer.Argument(Path("profiles/manual_safe.yaml"))
DEFAULT_PROFILE_OPT = typer.Option(Path("profiles/manual_safe.yaml"))
DESTINATION_PROFILE_OPT = typer.Option(Path("profiles/destination_example.yaml"))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@app.command()
def init(target: Path = DEFAULT_PROFILE_ARG):
    """Create a starter profile file for a new project."""
    # Ensure the destination directory exists before touching the file.
    target.parent.mkdir(parents=True, exist_ok=True)

    # An existing file is never overwritten; only a missing one is created.
    created = not target.exists()
    if created:
        template = resources.files("ingestforge.profiles") / "manual_safe.yaml"
        target.write_text(template.read_text(encoding="utf-8"), encoding="utf-8")

    # Report what happened, plus a suggested follow-up command, as JSON.
    summary = {
        "version": __version__,
        "profile": str(target),
        "created": created,
        "next": f"ingestforge ingest-url https://example.com/article --profile {target}",
    }
    typer.echo(json.dumps(summary, indent=2))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@app.command("validate-profile")
def validate_profile_cmd(profile: Path):
    # Validate the profile at ``profile`` and print the result as indented JSON.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    validated = validate_profile(profile)
    rendered = validated.model_dump_json(indent=2)
    typer.echo(rendered)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@app.command("ingest-url")
def ingest_url(
    url: str,
    profile: Path = DEFAULT_PROFILE_OPT,
    dry_run: bool = True,
    runs_dir: Path = Path("runs"),
    no_external_calls: bool = typer.Option(
        False, "--no-external-calls", help="Disable all external AI/search calls for this run."
    ),
    external_calls: str | None = typer.Option(None, "--external-calls"),
    live: bool = typer.Option(
        False, "--live", help="Enable live external calls according to the profile."
    ),
    no_live: bool = typer.Option(False, "--no-live", help="Force external calls to disabled."),
    write_dataset: bool = True,
):
    # Ingest one URL with the given profile and print a small JSON summary.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)

    # Flag precedence: either "disable" flag wins, then --live, and only
    # otherwise the raw --external-calls value (which may be None).
    if no_external_calls or no_live:
        call_mode = "disabled"
    elif live:
        call_mode = "enabled"
    else:
        call_mode = external_calls

    # The same settings are applied twice: as profile overrides at load time
    # and as explicit arguments to the pipeline call.
    pipeline_overrides: dict[str, object] = {
        "dry_run": dry_run,
        "write_dataset": write_dataset,
    }
    if call_mode is not None:
        pipeline_overrides["external_calls"] = call_mode

    loaded = load_profile(profile, overrides={"pipeline": pipeline_overrides})
    runner = IngestPipeline.from_profile(loaded)
    package = runner.ingest_url(
        url,
        dry_run=dry_run,
        runs_dir=runs_dir,
        write_dataset=write_dataset,
        external_calls=call_mode,
    )

    summary = {
        "job_id": package.job_id,
        "package": str(runs_dir / package.job_id / "package.json"),
        "valid": package.validation_report.is_valid,
    }
    typer.echo(json.dumps(summary, indent=2))
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@app.command("search-topic")
def search_topic(
    query: str,
    profile: Path = DEFAULT_PROFILE_OPT,
    dry_run: bool = True,
    no_external_calls: bool = typer.Option(
        False, "--no-external-calls", help="Disable all external AI/search calls for this run."
    ),
    external_calls: str | None = typer.Option(None, "--external-calls"),
    live: bool = typer.Option(
        False, "--live", help="Enable live external calls according to the profile."
    ),
    no_live: bool = typer.Option(False, "--no-live", help="Force external calls to disabled."),
):
    # Run a topic search through the pipeline and print the URLs as JSON.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)

    # Flag precedence: either "disable" flag wins, then --live, and only
    # otherwise the raw --external-calls value (which may be None).
    if no_external_calls or no_live:
        call_mode = "disabled"
    elif live:
        call_mode = "enabled"
    else:
        call_mode = external_calls

    pipeline_overrides: dict[str, object] = {"dry_run": dry_run}
    if call_mode is not None:
        pipeline_overrides["external_calls"] = call_mode

    loaded = load_profile(profile, overrides={"pipeline": pipeline_overrides})
    runner = IngestPipeline.from_profile(loaded)
    urls = runner.search_topic(query, external_calls=call_mode)

    typer.echo(json.dumps({"urls": urls}, indent=2))
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@app.command("build-dataset")
def build_dataset(run_dir: Path):
    # Load ``package.json`` from ``run_dir`` and ask the package to write its
    # dataset, passing the parent of ``run_dir`` as the target; then print
    # ``run_dir``.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    package_file = run_dir / "package.json"
    read_package(package_file).write_dataset(run_dir.parent)
    typer.echo(str(run_dir))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@app.command("validate-package")
def validate_package(path: Path):
    # Validate the package file at ``path``, print the report as indented
    # JSON, and exit with status 0 when the report is valid, 1 otherwise.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    report = read_package(path).validate_package()
    typer.echo(report.model_dump_json(indent=2))

    exit_code = 0 if report.is_valid else 1
    raise typer.Exit(exit_code)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@app.command("export-rag")
def export_rag(run_dir: Path):
    # Run the JSONL export for ``run_dir`` and print whatever it returns.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    typer.echo(str(export_jsonl(run_dir)))
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@app.command()
def publish(run_dir: Path, profile: Path = DESTINATION_PROFILE_OPT):
    # Publish the package stored in ``run_dir`` to the destination selected by
    # the profile, then print the publish result as JSON.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    package = read_package(run_dir / "package.json")
    loaded = load_profile(profile)

    # "local_export" uses the local destination rooted at the parent of
    # ``run_dir``; every other provider value goes through the REST destination.
    if loaded.destination.provider == "local_export":
        destination = LocalExportDestination(run_dir.parent)
    else:
        destination = GenericRestDestination(loaded.destination)

    outcome = destination.publish(package)
    typer.echo(json.dumps(outcome, indent=2, ensure_ascii=False))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@doctor_app.callback(invoke_without_command=True)
def doctor_summary(ctx: typer.Context, profile: Path = DEFAULT_PROFILE_OPT):
    # Callback for the diagnostics sub-application: when no subcommand was
    # given, print a short JSON summary of the environment and profile.
    # (Documented with comments rather than a docstring so the --help text
    # stays exactly as it was.)
    if ctx.invoked_subcommand is None:
        loaded = load_profile(profile)
        typer.echo(
            json.dumps(
                {
                    "python": sys.version.split()[0],
                    "profile": loaded.profile_name,
                    "ai_provider": loaded.ai.provider,
                    "destination": loaded.destination.provider,
                    "dry_run": loaded.pipeline.dry_run,
                },
                indent=2,
            )
        )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@doctor_app.command("providers")
def doctor_providers(
    profile: Path = DEFAULT_PROFILE_OPT,
    offline: bool = typer.Option(
        False, "--offline", help="Run local payload/schema checks only; this overrides --live."
    ),
    live: bool = typer.Option(
        False, "--live", help="Run optional live smoke when env opt-in and credentials exist."
    ),
):
    # Run the provider diagnostics for the profile and print the result as JSON.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    loaded = load_profile(profile)

    # --offline takes precedence: live checks run only when --live is set
    # and --offline is not.
    run_live = live and not offline

    report = run_provider_doctor(loaded, live=run_live)
    typer.echo(json.dumps(report, indent=2, ensure_ascii=False))
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Mount the diagnostics sub-application on the root app under the name "doctor".
app.add_typer(doctor_app, name="doctor")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@app.command()
def version():
    # Print the package version string imported from ``ingestforge``.
    # (Documented with comments rather than a docstring so the command's
    # --help text stays exactly as it was.)
    typer.echo(__version__)
|