PyPI - docpull - Versions diffs - 1.3.0__tar.gz → 2.0.0__tar.gz - Mend

docpull 1.3.0__tar.gz → 2.0.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (120) hide show

docpull-2.0.0/PKG-INFO +207 -0
docpull-2.0.0/README.md +130 -0
{docpull-1.3.0 → docpull-2.0.0}/pyproject.toml +35 -5
docpull-2.0.0/src/docpull/__init__.py +56 -0
docpull-2.0.0/src/docpull/cache/__init__.py +12 -0
docpull-2.0.0/src/docpull/cache/manager.py +388 -0
docpull-2.0.0/src/docpull/cache/streaming_dedup.py +135 -0
docpull-2.0.0/src/docpull/cli.py +408 -0
docpull-2.0.0/src/docpull/concurrency/__init__.py +15 -0
docpull-2.0.0/src/docpull/concurrency/browser_pool.py +337 -0
docpull-2.0.0/src/docpull/concurrency/manager.py +111 -0
docpull-2.0.0/src/docpull/conversion/__init__.py +15 -0
docpull-2.0.0/src/docpull/conversion/extractor.py +246 -0
docpull-2.0.0/src/docpull/conversion/markdown.py +201 -0
docpull-2.0.0/src/docpull/conversion/protocols.py +46 -0
docpull-2.0.0/src/docpull/core/__init__.py +5 -0
docpull-2.0.0/src/docpull/core/fetcher.py +501 -0
docpull-2.0.0/src/docpull/discovery/__init__.py +29 -0
docpull-2.0.0/src/docpull/discovery/composite.py +127 -0
docpull-2.0.0/src/docpull/discovery/crawler.py +242 -0
docpull-2.0.0/src/docpull/discovery/filters.py +230 -0
docpull-2.0.0/src/docpull/discovery/protocols.py +52 -0
docpull-2.0.0/src/docpull/discovery/sitemap.py +258 -0
docpull-2.0.0/src/docpull/http/__init__.py +12 -0
docpull-2.0.0/src/docpull/http/client.py +321 -0
docpull-2.0.0/src/docpull/http/protocols.py +76 -0
docpull-2.0.0/src/docpull/http/rate_limiter.py +148 -0
{docpull-1.3.0 → docpull-2.0.0/src}/docpull/metadata_extractor.py +3 -3
docpull-2.0.0/src/docpull/models/__init__.py +37 -0
docpull-2.0.0/src/docpull/models/config.py +265 -0
docpull-2.0.0/src/docpull/models/events.py +145 -0
docpull-2.0.0/src/docpull/models/profiles.py +101 -0
docpull-2.0.0/src/docpull/pipeline/__init__.py +5 -0
docpull-2.0.0/src/docpull/pipeline/base.py +187 -0
docpull-2.0.0/src/docpull/pipeline/steps/__init__.py +17 -0
docpull-2.0.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
docpull-2.0.0/src/docpull/pipeline/steps/convert.py +134 -0
docpull-2.0.0/src/docpull/pipeline/steps/dedup.py +96 -0
docpull-2.0.0/src/docpull/pipeline/steps/fetch.py +192 -0
docpull-2.0.0/src/docpull/pipeline/steps/metadata.py +139 -0
docpull-2.0.0/src/docpull/pipeline/steps/save.py +167 -0
docpull-2.0.0/src/docpull/pipeline/steps/validate.py +140 -0
docpull-2.0.0/src/docpull/security/__init__.py +6 -0
docpull-2.0.0/src/docpull/security/robots.py +192 -0
docpull-2.0.0/src/docpull/security/url_validator.py +174 -0
docpull-2.0.0/src/docpull.egg-info/PKG-INFO +207 -0
docpull-2.0.0/src/docpull.egg-info/SOURCES.txt +59 -0
docpull-2.0.0/src/docpull.egg-info/dependency_links.txt +1 -0
docpull-2.0.0/src/docpull.egg-info/entry_points.txt +2 -0
docpull-2.0.0/src/docpull.egg-info/requires.txt +39 -0
docpull-2.0.0/src/docpull.egg-info/top_level.txt +1 -0
docpull-2.0.0/tests/test_v2_conversion.py +294 -0
docpull-2.0.0/tests/test_v2_discovery.py +355 -0
docpull-2.0.0/tests/test_v2_integration.py +359 -0
docpull-2.0.0/tests/test_v2_pipeline.py +369 -0
docpull-1.3.0/.editorconfig +0 -30
docpull-1.3.0/.pre-commit-config.yaml +0 -30
docpull-1.3.0/CHANGELOG.md +0 -403
docpull-1.3.0/CONTRIBUTING.md +0 -189
docpull-1.3.0/MANIFEST.in +0 -49
docpull-1.3.0/Makefile +0 -44
docpull-1.3.0/PKG-INFO +0 -459
docpull-1.3.0/README.md +0 -389
docpull-1.3.0/SECURITY.md +0 -206
docpull-1.3.0/TROUBLESHOOTING.md +0 -348
docpull-1.3.0/docpull/__init__.py +0 -15
docpull-1.3.0/docpull/archive.py +0 -186
docpull-1.3.0/docpull/cache.py +0 -256
docpull-1.3.0/docpull/cli.py +0 -851
docpull-1.3.0/docpull/config.py +0 -316
docpull-1.3.0/docpull/fetchers/__init__.py +0 -9
docpull-1.3.0/docpull/fetchers/async_fetcher.py +0 -322
docpull-1.3.0/docpull/fetchers/base.py +0 -502
docpull-1.3.0/docpull/fetchers/generic.py +0 -255
docpull-1.3.0/docpull/fetchers/generic_async.py +0 -290
docpull-1.3.0/docpull/fetchers/parallel_base.py +0 -93
docpull-1.3.0/docpull/fetchers/stripe.py +0 -49
docpull-1.3.0/docpull/formatters/__init__.py +0 -50
docpull-1.3.0/docpull/formatters/base.py +0 -102
docpull-1.3.0/docpull/formatters/json.py +0 -100
docpull-1.3.0/docpull/formatters/markdown.py +0 -49
docpull-1.3.0/docpull/formatters/sqlite.py +0 -266
docpull-1.3.0/docpull/formatters/toon.py +0 -90
docpull-1.3.0/docpull/hooks.py +0 -222
docpull-1.3.0/docpull/indexer.py +0 -410
docpull-1.3.0/docpull/metadata.py +0 -224
docpull-1.3.0/docpull/naming.py +0 -259
docpull-1.3.0/docpull/orchestrator.py +0 -254
docpull-1.3.0/docpull/processors/__init__.py +0 -18
docpull-1.3.0/docpull/processors/base.py +0 -151
docpull-1.3.0/docpull/processors/content_filter.py +0 -292
docpull-1.3.0/docpull/processors/deduplicator.py +0 -233
docpull-1.3.0/docpull/processors/language_filter.py +0 -181
docpull-1.3.0/docpull/processors/size_limiter.py +0 -221
docpull-1.3.0/docpull/profiles/__init__.py +0 -53
docpull-1.3.0/docpull/profiles/base.py +0 -64
docpull-1.3.0/docpull/profiles/stripe.py +0 -14
docpull-1.3.0/docpull/sources_config.py +0 -446
docpull-1.3.0/docpull/utils/__init__.py +0 -6
docpull-1.3.0/docpull/utils/file_utils.py +0 -97
docpull-1.3.0/docpull/vcs.py +0 -224
docpull-1.3.0/docpull.egg-info/SOURCES.txt +0 -64
docpull-1.3.0/examples/README.md +0 -280
docpull-1.3.0/examples/deduplication-strategies.yaml +0 -29
docpull-1.3.0/examples/format-conversion.yaml +0 -25
docpull-1.3.0/examples/incremental-updates.yaml +0 -26
docpull-1.3.0/examples/multi-source-optimized.yaml +0 -45
docpull-1.3.0/examples/selective-crawling.yaml +0 -26
docpull-1.3.0/examples/simple-optimization.yaml +0 -14
docpull-1.3.0/requirements.txt +0 -34
docpull-1.3.0/tests/test_config.py +0 -43
docpull-1.3.0/tests/test_metadata_extractor.py +0 -233
docpull-1.3.0/tests/test_orchestrator.py +0 -331
docpull-1.3.0/tests/test_sources_config.py +0 -348
{docpull-1.3.0 → docpull-2.0.0}/LICENSE +0 -0
{docpull-1.3.0 → docpull-2.0.0}/setup.cfg +0 -0
{docpull-1.3.0 → docpull-2.0.0/src}/docpull/__main__.py +0 -0
{docpull-1.3.0 → docpull-2.0.0/src}/docpull/doctor.py +0 -0
{docpull-1.3.0/docpull/utils → docpull-2.0.0/src/docpull}/logging_config.py +0 -0
{docpull-1.3.0 → docpull-2.0.0/src}/docpull/py.typed +0 -0

docpull-2.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,207 @@
+Metadata-Version: 2.4
+Name: docpull
+Version: 2.0.0
+Summary: Pull documentation from the web and convert to clean markdown
+Author-email: Zachary Roth <support@raintree.technology>
+Maintainer-email: Raintree Technology <support@raintree.technology>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/raintree-technology/docpull
+Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
+Project-URL: Repository, https://github.com/raintree-technology/docpull
+Project-URL: Source Code, https://github.com/raintree-technology/docpull
+Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
+Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
+Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Education
+Classifier: Environment :: Console
+Classifier: Topic :: Documentation
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: Topic :: Software Development :: Documentation
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Classifier: Topic :: Text Processing :: Markup :: Markdown
+Classifier: Topic :: Utilities
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Typing :: Typed
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests>=2.31.0
+Requires-Dist: beautifulsoup4>=4.12.0
+Requires-Dist: html2text>=2020.1.16
+Requires-Dist: defusedxml>=0.7.1
+Requires-Dist: extruct>=0.15.0
+Requires-Dist: aiohttp>=3.9.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: gitpython>=3.1.40
+Requires-Dist: pydantic>=2.0
+Provides-Extra: js
+Requires-Dist: playwright>=1.40.0; extra == "js"
+Provides-Extra: proxy
+Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
+Provides-Extra: normalize
+Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
+Provides-Extra: all
+Requires-Dist: playwright>=1.40.0; extra == "all"
+Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
+Requires-Dist: url-normalize>=1.4.0; extra == "all"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: bandit>=1.7.0; extra == "dev"
+Requires-Dist: pip-audit>=2.0.0; extra == "dev"
+Requires-Dist: pre-commit>=3.0.0; extra == "dev"
+Requires-Dist: types-requests>=2.31.0; extra == "dev"
+Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
+Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
+Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
+Dynamic: license-file
+# docpull
+**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+[![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
+[![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
+## Install
+```bash
+pip install docpull
+```
+## Usage
+```bash
+# Basic fetch
+docpull https://docs.example.com
+# With options
+docpull https://aptos.dev --max-pages 100 --output-dir ./docs
+# Filter paths
+docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
+# Enable caching for incremental updates
+docpull https://docs.example.com --cache
+# JavaScript-heavy sites
+pip install docpull[js]
+docpull https://spa-site.com --js
+```
+## Profiles
+```bash
+docpull https://site.com --profile rag      # Optimized for RAG/LLM (default)
+docpull https://site.com --profile mirror   # Full site archive with caching
+docpull https://site.com --profile quick    # Fast sampling (50 pages, depth 2)
+```
+## Options
+```
+Crawl:
+  --max-pages N           Maximum pages to fetch
+  --max-depth N           Maximum crawl depth
+  --include-paths P       Only crawl matching URL patterns
+  --exclude-paths P       Skip matching URL patterns
+  --js                    Enable JavaScript rendering
+Cache:
+  --cache                 Enable caching for incremental updates
+  --cache-dir DIR         Cache directory (default: .docpull-cache)
+  --cache-ttl DAYS        Days before cache expires (default: 30)
+Content:
+  --streaming-dedup       Real-time duplicate detection
+  --language CODE         Filter by language (e.g., en)
+Output:
+  --output-dir, -o DIR    Output directory (default: ./docs)
+  --dry-run               Show what would be fetched
+  --verbose, -v           Verbose output
+```
+See `docpull --help` for all options.
+## Python API
+```python
+import asyncio
+from docpull import Fetcher, DocpullConfig, ProfileName, EventType
+async def main():
+    config = DocpullConfig(
+        url="https://docs.example.com",
+        profile=ProfileName.RAG,
+        crawl={"max_pages": 100},
+        cache={"enabled": True},
+    )
+    async with Fetcher(config) as fetcher:
+        async for event in fetcher.run():
+            if event.type == EventType.FETCH_PROGRESS:
+                print(f"{event.current}/{event.total}: {event.url}")
+        print(f"Done: {fetcher.stats.pages_fetched} pages")
+asyncio.run(main())
+```
+## Output
+Each page becomes a Markdown file with YAML frontmatter:
+```markdown
+---
+title: "Getting Started"
+source: https://docs.example.com/guide
+---
+# Getting Started
+...
+```
+## Security
+- HTTPS-only, mandatory robots.txt compliance
+- Blocks private/internal network IPs
+- Path traversal and XXE protection
+## Troubleshooting
+```bash
+docpull --doctor              # Check installation
+docpull URL --verbose         # Verbose output
+docpull URL --dry-run         # Test without downloading
+```
+## Links
+- [PyPI](https://pypi.org/project/docpull/)
+- [GitHub](https://github.com/raintree-technology/docpull)
+- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+## License
+MIT

docpull-2.0.0/README.md ADDED Viewed

@@ -0,0 +1,130 @@
+# docpull
+**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
+[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+[![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
+[![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
+## Install
+```bash
+pip install docpull
+```
+## Usage
+```bash
+# Basic fetch
+docpull https://docs.example.com
+# With options
+docpull https://aptos.dev --max-pages 100 --output-dir ./docs
+# Filter paths
+docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
+# Enable caching for incremental updates
+docpull https://docs.example.com --cache
+# JavaScript-heavy sites
+pip install docpull[js]
+docpull https://spa-site.com --js
+```
+## Profiles
+```bash
+docpull https://site.com --profile rag      # Optimized for RAG/LLM (default)
+docpull https://site.com --profile mirror   # Full site archive with caching
+docpull https://site.com --profile quick    # Fast sampling (50 pages, depth 2)
+```
+## Options
+```
+Crawl:
+  --max-pages N           Maximum pages to fetch
+  --max-depth N           Maximum crawl depth
+  --include-paths P       Only crawl matching URL patterns
+  --exclude-paths P       Skip matching URL patterns
+  --js                    Enable JavaScript rendering
+Cache:
+  --cache                 Enable caching for incremental updates
+  --cache-dir DIR         Cache directory (default: .docpull-cache)
+  --cache-ttl DAYS        Days before cache expires (default: 30)
+Content:
+  --streaming-dedup       Real-time duplicate detection
+  --language CODE         Filter by language (e.g., en)
+Output:
+  --output-dir, -o DIR    Output directory (default: ./docs)
+  --dry-run               Show what would be fetched
+  --verbose, -v           Verbose output
+```
+See `docpull --help` for all options.
+## Python API
+```python
+import asyncio
+from docpull import Fetcher, DocpullConfig, ProfileName, EventType
+async def main():
+    config = DocpullConfig(
+        url="https://docs.example.com",
+        profile=ProfileName.RAG,
+        crawl={"max_pages": 100},
+        cache={"enabled": True},
+    )
+    async with Fetcher(config) as fetcher:
+        async for event in fetcher.run():
+            if event.type == EventType.FETCH_PROGRESS:
+                print(f"{event.current}/{event.total}: {event.url}")
+        print(f"Done: {fetcher.stats.pages_fetched} pages")
+asyncio.run(main())
+```
+## Output
+Each page becomes a Markdown file with YAML frontmatter:
+```markdown
+---
+title: "Getting Started"
+source: https://docs.example.com/guide
+---
+# Getting Started
+...
+```
+## Security
+- HTTPS-only, mandatory robots.txt compliance
+- Blocks private/internal network IPs
+- Path traversal and XXE protection
+## Troubleshooting
+```bash
+docpull --doctor              # Check installation
+docpull URL --verbose         # Verbose output
+docpull URL --dry-run         # Test without downloading
+```
+## Links
+- [PyPI](https://pypi.org/project/docpull/)
+- [GitHub](https://github.com/raintree-technology/docpull)
+- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
+## License
+MIT

{docpull-1.3.0 → docpull-2.0.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "docpull"
-version = "1.3.0"
+version = "2.0.0"
 dynamic = []
 description = "Pull documentation from the web and convert to clean markdown"
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -72,14 +72,23 @@ dependencies = [
     "rich>=13.0.0",
     "pyyaml>=6.0",
     "gitpython>=3.1.40",
+    "pydantic>=2.0",
 ]
 [project.optional-dependencies]
 js = [
     "playwright>=1.40.0",
 ]
+proxy = [
+    "aiohttp-socks>=0.8.0",
+]
+normalize = [
+    "url-normalize>=1.4.0",
+]
 all = [
     "playwright>=1.40.0",
+    "aiohttp-socks>=0.8.0",
+    "url-normalize>=1.4.0",
 ]
 dev = [
     "pytest>=7.0.0",
@@ -106,10 +115,10 @@ Documentation = "https://github.com/raintree-technology/docpull#readme"
 Repository = "https://github.com/raintree-technology/docpull"
 "Source Code" = "https://github.com/raintree-technology/docpull"
 "Bug Tracker" = "https://github.com/raintree-technology/docpull/issues"
-"Changelog" = "https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md"
+"Releases" = "https://github.com/raintree-technology/docpull/releases"
 [tool.setuptools.packages.find]
-where = ["."]
+where = ["src"]
 include = ["docpull*"]
 [tool.setuptools.package-data]
@@ -125,7 +134,7 @@ target-version = "py39"
 [tool.ruff.lint]
 select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
-ignore = []
+ignore = ["A003"]  # Allow 'type' and 'format' as field names in data models
 [tool.mypy]
 python_version = "3.9"
@@ -136,7 +145,14 @@ disallow_any_unimported = true
 no_implicit_optional = true
 strict_equality = true
 warn_redundant_casts = true
+ignore_missing_imports = true
 exclude = ["tests/"]
+plugins = ["pydantic.mypy"]
+[tool.pydantic-mypy]
+init_forbid_extra = true
+init_typed = true
+warn_required_dynamic_aliases = true
 [[tool.mypy.overrides]]
 module = "playwright.*"
@@ -146,6 +162,20 @@ ignore_missing_imports = true
 module = "extruct.*"
 ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = "url_normalize"
+ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = "docpull.models.*"
+disallow_any_unimported = false
+warn_return_any = false
+[[tool.mypy.overrides]]
+module = "docpull.concurrency.browser_pool"
+disallow_any_unimported = false
+warn_return_any = false
 [[tool.mypy.overrides]]
 module = "tests.*"
 disallow_untyped_defs = false
@@ -167,7 +197,7 @@ markers = [
 ]
 [tool.coverage.run]
-source = ["docpull"]
+source = ["src/docpull"]
 omit = ["tests/*", "*/test_*.py"]
 [tool.coverage.report]

docpull-2.0.0/src/docpull/__init__.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""
+docpull - Fetch and convert documentation from any URL to markdown.
+Usage:
+    from docpull import Fetcher, DocpullConfig, ProfileName
+    config = DocpullConfig(
+        url="https://docs.example.com",
+        profile=ProfileName.RAG,
+    )
+    async with Fetcher(config) as fetcher:
+        async for event in fetcher.run():
+            print(event)
+"""
+__version__ = "2.0.0"
+from .cache import CacheManager, StreamingDeduplicator
+from .core.fetcher import Fetcher, fetch_blocking
+from .models.config import (
+    CacheConfig,
+    ContentFilterConfig,
+    CrawlConfig,
+    DocpullConfig,
+    IntegrationConfig,
+    NetworkConfig,
+    OutputConfig,
+    PerformanceConfig,
+    ProfileName,
+)
+from .models.events import EventType, FetchEvent, FetchStats
+__all__ = [
+    "__version__",
+    # Core
+    "Fetcher",
+    "fetch_blocking",
+    # Config
+    "DocpullConfig",
+    "ProfileName",
+    "CrawlConfig",
+    "ContentFilterConfig",
+    "OutputConfig",
+    "NetworkConfig",
+    "PerformanceConfig",
+    "IntegrationConfig",
+    "CacheConfig",
+    # Events
+    "EventType",
+    "FetchEvent",
+    "FetchStats",
+    # Cache
+    "CacheManager",
+    "StreamingDeduplicator",
+]

docpull-2.0.0/src/docpull/cache/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Caching and deduplication for docpull."""
+from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
+from .streaming_dedup import StreamingDeduplicator
+__all__ = [
+    "CacheManager",
+    "CacheState",
+    "ManifestEntry",
+    "StreamingDeduplicator",
+    "DEFAULT_TTL_DAYS",
+]

docpull 1.3.0__tar.gz → 2.0.0__tar.gz

docpull 1.3.0__tar.gz → 2.0.0__tar.gz