docpull 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull/__init__.py +29 -0
- docpull/__main__.py +6 -0
- docpull/cli.py +440 -0
- docpull/config.py +199 -0
- docpull/fetchers/__init__.py +23 -0
- docpull/fetchers/async_fetcher.py +322 -0
- docpull/fetchers/base.py +450 -0
- docpull/fetchers/bun.py +59 -0
- docpull/fetchers/d3.py +211 -0
- docpull/fetchers/generic.py +255 -0
- docpull/fetchers/generic_async.py +282 -0
- docpull/fetchers/nextjs.py +50 -0
- docpull/fetchers/parallel_base.py +93 -0
- docpull/fetchers/plaid.py +92 -0
- docpull/fetchers/react.py +59 -0
- docpull/fetchers/stripe.py +60 -0
- docpull/fetchers/tailwind.py +59 -0
- docpull/fetchers/turborepo.py +57 -0
- docpull/profiles/__init__.py +70 -0
- docpull/profiles/base.py +64 -0
- docpull/profiles/bun.py +14 -0
- docpull/profiles/d3.py +17 -0
- docpull/profiles/nextjs.py +15 -0
- docpull/profiles/plaid.py +16 -0
- docpull/profiles/react.py +14 -0
- docpull/profiles/stripe.py +14 -0
- docpull/profiles/tailwind.py +14 -0
- docpull/profiles/turborepo.py +14 -0
- docpull/py.typed +0 -0
- docpull/utils/__init__.py +6 -0
- docpull/utils/file_utils.py +97 -0
- docpull/utils/logging_config.py +54 -0
- docpull-1.0.1.dist-info/METADATA +440 -0
- docpull-1.0.1.dist-info/RECORD +38 -0
- docpull-1.0.1.dist-info/WHEEL +5 -0
- docpull-1.0.1.dist-info/entry_points.txt +2 -0
- docpull-1.0.1.dist-info/licenses/LICENSE +21 -0
- docpull-1.0.1.dist-info/top_level.txt +1 -0
docpull/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
__version__ = "1.0.1"
|
|
2
|
+
|
|
3
|
+
from .fetchers.base import BaseFetcher
|
|
4
|
+
from .fetchers.bun import BunFetcher
|
|
5
|
+
from .fetchers.d3 import D3DevDocsFetcher
|
|
6
|
+
from .fetchers.generic import GenericFetcher
|
|
7
|
+
from .fetchers.generic_async import GenericAsyncFetcher
|
|
8
|
+
from .fetchers.nextjs import NextJSFetcher
|
|
9
|
+
from .fetchers.parallel_base import ParallelFetcher
|
|
10
|
+
from .fetchers.plaid import PlaidFetcher
|
|
11
|
+
from .fetchers.react import ReactFetcher
|
|
12
|
+
from .fetchers.stripe import StripeFetcher
|
|
13
|
+
from .fetchers.tailwind import TailwindFetcher
|
|
14
|
+
from .fetchers.turborepo import TurborepoFetcher
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"BaseFetcher",
|
|
18
|
+
"BunFetcher",
|
|
19
|
+
"D3DevDocsFetcher",
|
|
20
|
+
"GenericFetcher",
|
|
21
|
+
"GenericAsyncFetcher",
|
|
22
|
+
"NextJSFetcher",
|
|
23
|
+
"ParallelFetcher",
|
|
24
|
+
"PlaidFetcher",
|
|
25
|
+
"ReactFetcher",
|
|
26
|
+
"StripeFetcher",
|
|
27
|
+
"TailwindFetcher",
|
|
28
|
+
"TurborepoFetcher",
|
|
29
|
+
]
|
docpull/__main__.py
ADDED
docpull/cli.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from . import __version__
|
|
7
|
+
from .config import FetcherConfig
|
|
8
|
+
from .fetchers import (
|
|
9
|
+
BunFetcher,
|
|
10
|
+
D3DevDocsFetcher,
|
|
11
|
+
NextJSFetcher,
|
|
12
|
+
PlaidFetcher,
|
|
13
|
+
ReactFetcher,
|
|
14
|
+
StripeFetcher,
|
|
15
|
+
TailwindFetcher,
|
|
16
|
+
TurborepoFetcher,
|
|
17
|
+
)
|
|
18
|
+
from .fetchers.generic_async import GenericAsyncFetcher
|
|
19
|
+
from .utils.logging_config import setup_logging
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``docpull`` command line.

    Options that may also be supplied by a config file (``--output-dir``,
    ``--source``, ``--rate-limit``, ``--log-level``, ``--log-file``) default
    to ``None`` so that "not given on the command line" can be told apart
    from an explicit value.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="docpull",
        description="Fetch and convert documentation from any URL or known sources to markdown",
        # Raw formatter keeps the hand-formatted example block below intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch from any documentation URL
  docpull https://aptos.dev
  docpull https://docs.anthropic.com

  # Fetch using profile names (shortcuts)
  docpull stripe
  docpull nextjs plaid

  # Mix URLs and profiles
  docpull stripe https://newsite.com/docs

  # Control scraping depth and pages
  docpull https://example.com/docs --max-pages 100 --max-depth 3

  # Legacy syntax still works
  docpull --source stripe --source nextjs

  # Use a config file
  docpull --config config.yaml

  # Generate a sample config file
  docpull --generate-config config.yaml
        """,
    )

    # Positional arguments for URLs or profile names
    parser.add_argument(
        "targets",
        nargs="*",
        help="URLs or profile names to fetch (e.g., 'https://docs.site.com', 'stripe', 'nextjs')",
    )

    parser.add_argument(
        "--config",
        "-c",
        type=Path,
        help="Path to config file (YAML or JSON)",
    )

    parser.add_argument(
        "--output-dir",
        "-o",
        type=Path,
        default=None,
        help="Directory to save documentation (default: ./docs)",
    )

    parser.add_argument(
        "--source",
        "-s",
        nargs="+",
        choices=["all", "bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"],
        default=None,
        dest="sources",
        # The effective default comes from FetcherConfig (plaid + stripe),
        # not "all" as the help text previously claimed.
        help="Documentation source(s) to fetch. Use 'all' for everything. (default: plaid, stripe)",
    )

    parser.add_argument(
        "--rate-limit",
        "-r",
        type=float,
        default=None,
        help="Seconds to wait between requests (default: 0.5)",
    )

    parser.add_argument(
        "--no-skip-existing",
        action="store_true",
        help="Re-fetch files that already exist",
    )

    parser.add_argument(
        "--log-level",
        "-l",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Logging level (default: INFO)",
    )

    # --verbose and --quiet write to the same destination, so whichever
    # appears last on the command line wins.
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_const",
        const="DEBUG",
        dest="log_level_override",
        help="Enable verbose output (equivalent to --log-level DEBUG)",
    )

    parser.add_argument(
        "--quiet",
        "-q",
        action="store_const",
        const="ERROR",
        dest="log_level_override",
        help="Suppress informational output (equivalent to --log-level ERROR)",
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be fetched without actually downloading",
    )

    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Maximum number of pages to fetch (default: unlimited)",
    )

    parser.add_argument(
        "--max-depth",
        type=int,
        default=5,
        help="Maximum crawl depth when following links (default: 5)",
    )

    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=10,
        help="Maximum concurrent requests for async fetching (default: 10)",
    )

    parser.add_argument(
        "--js",
        "--javascript",
        action="store_true",
        dest="use_js",
        help="Enable JavaScript rendering with Playwright (slower but handles JS-heavy sites)",
    )

    parser.add_argument(
        "--no-progress",
        action="store_true",
        help="Disable progress bars",
    )

    parser.add_argument(
        "--log-file",
        type=Path,
        default=None,
        help="Path to log file (default: console only)",
    )

    parser.add_argument(
        "--generate-config",
        type=Path,
        metavar="PATH",
        help="Generate a sample config file and exit",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    return parser
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def generate_sample_config(output_path: Path) -> None:
    """
    Write a sample configuration file populated with default settings.

    The format is picked from the file extension: ``.json`` produces JSON,
    ``.yaml``/``.yml`` produce YAML. Any other extension prints a warning,
    swaps the suffix for ``.yaml`` and writes YAML.

    Args:
        output_path: Destination path for the sample config
    """
    sample = FetcherConfig()
    extension = output_path.suffix.lower()

    if extension == ".json":
        sample.save_json(output_path)
        print(f"Sample JSON config generated: {output_path}")
        return

    if extension not in (".yaml", ".yml"):
        # Unrecognised extension: fall back to YAML under a corrected name.
        print(f"Warning: Unknown extension {extension}, generating YAML")
        output_path = output_path.with_suffix(".yaml")

    sample.save_yaml(output_path)
    print(f"Sample YAML config generated: {output_path}")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def get_config(args: argparse.Namespace) -> FetcherConfig:
    """
    Build the effective configuration from the config file and CLI arguments.

    The config file (when ``--config`` is given) supplies the base values;
    any option explicitly passed on the command line overrides it.

    Args:
        args: Parsed command-line arguments

    Returns:
        FetcherConfig instance

    Raises:
        Whatever ``FetcherConfig.from_file`` raises for a missing,
        unsupported or invalid config file.
    """
    # Load from config file if provided
    config = FetcherConfig.from_file(args.config) if args.config else FetcherConfig()

    # Override with command-line arguments
    if args.output_dir is not None:
        config.output_dir = args.output_dir

    if args.sources is not None:
        # Handle "all" keyword
        if "all" in args.sources:
            config.sources = ["bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"]
        else:
            config.sources = args.sources

    if args.rate_limit is not None:
        config.rate_limit = args.rate_limit

    if args.no_skip_existing:
        config.skip_existing = False

    # Handle log level (verbose/quiet shortcuts override --log-level)
    if args.log_level_override is not None:
        config.log_level = args.log_level_override
    elif args.log_level is not None:
        config.log_level = args.log_level

    if args.log_file is not None:
        config.log_file = str(args.log_file)

    # --dry-run is a store_true flag, so its absence parses as False. Only
    # let the flag switch dry-run ON; otherwise a config file's
    # ``dry_run: true`` would be silently reset to False.
    if args.dry_run:
        config.dry_run = True

    return config
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def run_fetchers(config: FetcherConfig) -> int:
    """
    Run the legacy, source-name-based fetchers described by ``config``.

    Each entry in ``config.sources`` is mapped to its fetcher class and
    fetched in turn. A failure in one source is logged and counted, and the
    remaining sources still run. In dry-run mode the sources that would be
    fetched are only logged; no fetcher is created or run.

    Args:
        config: FetcherConfig instance

    Returns:
        Exit code (0 for success, 1 for error)
    """
    # Setup logging
    logger = setup_logging(
        level=config.log_level,
        log_file=config.log_file,
    )

    logger.info("docpull - Documentation Fetcher")
    logger.info(f"Mode: {'DRY RUN' if config.dry_run else 'FETCH'}")
    logger.info(f"Output directory: {config.output_dir}")
    logger.info(f"Rate limit: {config.rate_limit}s between requests")
    logger.info(f"Skip existing: {config.skip_existing}")
    logger.info(f"Sources: {', '.join(config.sources)}")
    logger.info("")

    if config.dry_run:
        logger.info("DRY RUN MODE: No files will be downloaded")
        logger.info("")

    # Map source names to fetcher classes
    fetcher_map = {
        "bun": BunFetcher,
        "d3": D3DevDocsFetcher,
        "nextjs": NextJSFetcher,
        "plaid": PlaidFetcher,
        "react": ReactFetcher,
        "stripe": StripeFetcher,
        "tailwind": TailwindFetcher,
        "turborepo": TurborepoFetcher,
    }

    # Run fetchers
    errors = 0
    for source in config.sources:
        fetcher_class = fetcher_map.get(source)
        if fetcher_class is None:
            logger.error(f"Unknown source: {source}")
            errors += 1
            continue

        if config.dry_run:
            # The fetchers are not told about dry-run, so honouring the
            # "no files will be downloaded" promise means not running them.
            logger.info(f"Would fetch: {source}")
            continue

        try:
            fetcher = fetcher_class(
                output_dir=config.output_dir,
                rate_limit=config.rate_limit,
                skip_existing=config.skip_existing,
                logger=logger,
            )
            fetcher.fetch()
        except Exception as e:
            # Top-level boundary: record the failure and carry on with the
            # remaining sources.
            logger.error(f"Error fetching {source}: {e}", exc_info=True)
            errors += 1

    logger.info("")
    if errors > 0:
        logger.error(f"Completed with {errors} error(s)")
        return 1

    if config.dry_run:
        logger.info("Dry run complete: no files were downloaded")
    else:
        logger.info("All documentation fetched successfully")
    return 0
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def run_generic_fetchers(args: argparse.Namespace) -> int:
    """
    Run the generic async fetcher for every URL or profile name in ``args.targets``.

    Settings are read straight from the parsed arguments, with built-in
    defaults where an option was not given. A failure on one target is
    logged and counted, and the remaining targets still run. With
    ``--dry-run`` the targets are only logged; nothing is fetched.

    Args:
        args: Parsed command-line arguments

    Returns:
        Exit code (0 for success, 1 for error)
    """
    # Setup logging
    log_level = args.log_level_override or args.log_level or "INFO"
    logger = setup_logging(
        level=log_level,
        # Pass a string path, matching what the legacy source-based path
        # hands to setup_logging.
        log_file=str(args.log_file) if args.log_file is not None else None,
    )

    output_dir = Path(args.output_dir) if args.output_dir else Path("./docs")
    rate_limit = args.rate_limit if args.rate_limit is not None else 0.5
    skip_existing = not args.no_skip_existing
    max_pages = args.max_pages
    max_depth = args.max_depth
    max_concurrent = args.max_concurrent
    use_js = args.use_js
    show_progress = not args.no_progress
    # getattr keeps hand-built namespaces without a dry_run attribute working.
    dry_run = bool(getattr(args, "dry_run", False))

    logger.info("docpull - Universal Documentation Fetcher")
    logger.info(f"Mode: {'DRY RUN' if dry_run else 'FETCH'}")
    logger.info(f"Targets: {', '.join(args.targets)}")
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Rate limit: {rate_limit}s between requests")
    logger.info(f"Skip existing: {skip_existing}")
    logger.info(f"Max concurrent: {max_concurrent}")
    if max_pages is not None:
        logger.info(f"Max pages: {max_pages}")
    logger.info(f"Max depth: {max_depth}")
    if use_js:
        logger.info("JavaScript rendering: ENABLED (slower but handles JS sites)")
    logger.info("")

    if dry_run:
        logger.info("DRY RUN MODE: No files will be downloaded")
        logger.info("")

    # Run async generic fetcher for each target
    errors = 0
    for target in args.targets:
        if dry_run:
            # --dry-run promises that nothing is downloaded.
            logger.info(f"Would fetch: {target}")
            continue

        try:
            logger.info(f"Fetching: {target}")
            fetcher = GenericAsyncFetcher(
                url_or_profile=target,
                output_dir=output_dir,
                rate_limit=rate_limit,
                skip_existing=skip_existing,
                logger=logger,
                max_pages=max_pages,
                max_depth=max_depth,
                max_concurrent=max_concurrent,
                use_js=use_js,
                show_progress=show_progress,
            )
            fetcher.fetch()  # This calls asyncio.run() internally
        except Exception as e:
            # Top-level boundary: record the failure and carry on with the
            # remaining targets.
            logger.error(f"Error fetching {target}: {e}", exc_info=True)
            errors += 1

    logger.info("")
    if errors > 0:
        logger.error(f"Completed with {errors} error(s)")
        return 1

    if dry_run:
        logger.info("Dry run complete: no files were downloaded")
    else:
        logger.info("All documentation fetched successfully")
    return 0
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """
    Parse the command line and dispatch to the matching mode.

    Modes, in order of precedence: ``--generate-config`` writes a sample
    config and stops; positional targets use the URL/profile interface;
    otherwise the legacy source-based interface runs.

    Args:
        argv: Command-line arguments (defaults to sys.argv)

    Returns:
        Exit code
    """
    args = create_parser().parse_args(argv)

    # --generate-config short-circuits everything else.
    if args.generate_config:
        try:
            generate_sample_config(args.generate_config)
        except Exception as exc:
            print(f"Error generating config: {exc}", file=sys.stderr)
            return 1
        return 0

    # Positional targets select the new URL-based interface.
    if args.targets:
        return run_generic_fetchers(args)

    # No targets: fall back to the legacy source-based interface.
    try:
        legacy_config = get_config(args)
    except Exception as exc:
        print(f"Error loading configuration: {exc}", file=sys.stderr)
        return 1

    return run_fetchers(legacy_config)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
if __name__ == "__main__":
|
|
440
|
+
sys.exit(main())
|
docpull/config.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Optional
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import yaml # type: ignore
|
|
7
|
+
except ImportError:
|
|
8
|
+
yaml = None # type: ignore
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FetcherConfig:
    """Configuration for documentation fetchers."""

    def __init__(
        self,
        output_dir: str = "./docs",
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        log_level: str = "INFO",
        log_file: Optional[str] = None,
        sources: Optional[list[str]] = None,
        dry_run: bool = False,
    ):
        """
        Initialize configuration.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            log_level: Logging level
            log_file: Optional log file path
            sources: List of sources to fetch (e.g., ['stripe', 'plaid']);
                None or an empty list selects the default ['plaid', 'stripe']
            dry_run: Dry run mode (don't download files)
        """
        self.output_dir = Path(output_dir)
        self.rate_limit = rate_limit
        self.skip_existing = skip_existing
        self.log_level = log_level
        self.log_file = log_file
        self.sources = sources or ["plaid", "stripe"]
        self.dry_run = dry_run

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any]) -> "FetcherConfig":
        """
        Create configuration from dictionary.

        Missing keys fall back to the constructor defaults. ``None`` (what
        an empty YAML document parses to) is treated as an empty mapping.

        Args:
            config_dict: Configuration dictionary

        Returns:
            FetcherConfig instance

        Raises:
            ValueError: If configuration values are invalid
        """
        if config_dict is None:
            config_dict = {}
        if not isinstance(config_dict, dict):
            raise ValueError("Configuration must be a mapping of option names to values")

        # Validate output_dir doesn't contain path traversal
        output_dir = str(config_dict.get("output_dir", "./docs"))
        if ".." in output_dir or output_dir.startswith(("/etc", "/sys")):
            raise ValueError("Invalid output directory path")

        # Validate rate_limit is reasonable
        rate_limit = config_dict.get("rate_limit", 0.5)
        if not isinstance(rate_limit, (int, float)) or rate_limit < 0 or rate_limit > 60:
            raise ValueError("rate_limit must be between 0 and 60")

        # Validate sources
        valid_sources = {"bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"}
        sources = config_dict.get("sources")
        if sources is None:
            # Covers both a missing key and an explicit null in the file.
            sources = ["plaid", "stripe"]
        if not isinstance(sources, (list, tuple)):
            # A bare string would otherwise be checked character by character.
            raise ValueError("sources must be a list of source names")
        if not all(isinstance(s, str) and s in valid_sources for s in sources):
            # sorted() keeps the message stable; set ordering is not.
            raise ValueError(f"Invalid sources. Must be from: {sorted(valid_sources)}")
        sources = list(sources)

        # Validate log_level
        log_level = config_dict.get("log_level", "INFO")
        valid_log_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
        if not isinstance(log_level, str) or log_level.upper() not in valid_log_levels:
            raise ValueError(f"Invalid log_level. Must be one of: {sorted(valid_log_levels)}")
        # Store the canonical upper-case name: the check above is
        # case-insensitive, so the stored value must match what it accepted.
        log_level = log_level.upper()

        return cls(
            output_dir=output_dir,
            rate_limit=rate_limit,
            skip_existing=config_dict.get("skip_existing", True),
            log_level=log_level,
            log_file=config_dict.get("log_file"),
            sources=sources,
            dry_run=config_dict.get("dry_run", False),
        )

    @classmethod
    def from_yaml(cls, yaml_path: Path) -> "FetcherConfig":
        """
        Load configuration from YAML file.

        Args:
            yaml_path: Path to YAML config file

        Returns:
            FetcherConfig instance

        Raises:
            ImportError: If pyyaml is not installed
            FileNotFoundError: If config file doesn't exist
            ValueError: If the file content is not a valid configuration
        """
        if yaml is None:
            raise ImportError("PyYAML is required for YAML config. Install with: pip install pyyaml")

        if not yaml_path.exists():
            raise FileNotFoundError(f"Config file not found: {yaml_path}")

        with open(yaml_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document; from_dict maps
            # that to the defaults instead of failing.
            config_dict = yaml.safe_load(f)

        return cls.from_dict(config_dict)

    @classmethod
    def from_json(cls, json_path: Path) -> "FetcherConfig":
        """
        Load configuration from JSON file.

        Args:
            json_path: Path to JSON config file

        Returns:
            FetcherConfig instance

        Raises:
            FileNotFoundError: If config file doesn't exist
            ValueError: If the file content is not a valid configuration
        """
        if not json_path.exists():
            raise FileNotFoundError(f"Config file not found: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            config_dict = json.load(f)

        return cls.from_dict(config_dict)

    @classmethod
    def from_file(cls, config_path: Path) -> "FetcherConfig":
        """
        Load configuration from file (auto-detect format).

        The format is chosen from the file extension, compared
        case-insensitively: ``.yaml``/``.yml`` or ``.json``.

        Args:
            config_path: Path to config file

        Returns:
            FetcherConfig instance

        Raises:
            ValueError: If the file extension is not supported
        """
        suffix = config_path.suffix.lower()

        if suffix in (".yaml", ".yml"):
            return cls.from_yaml(config_path)
        if suffix == ".json":
            return cls.from_json(config_path)
        raise ValueError(f"Unsupported config file format: {suffix}")

    def to_dict(self) -> dict[str, Any]:
        """
        Convert configuration to dictionary.

        Returns:
            Configuration as dictionary (``output_dir`` is rendered as a string)
        """
        return {
            "output_dir": str(self.output_dir),
            "rate_limit": self.rate_limit,
            "skip_existing": self.skip_existing,
            "log_level": self.log_level,
            "log_file": self.log_file,
            "sources": self.sources,
            "dry_run": self.dry_run,
        }

    def save_yaml(self, yaml_path: Path) -> None:
        """
        Save configuration to YAML file.

        Args:
            yaml_path: Path to save YAML config

        Raises:
            ImportError: If pyyaml is not installed
        """
        if yaml is None:
            raise ImportError("PyYAML is required for YAML config. Install with: pip install pyyaml")

        with open(yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

    def save_json(self, json_path: Path) -> None:
        """
        Save configuration to JSON file.

        Args:
            json_path: Path to save JSON config
        """
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)
|
|
docpull/fetchers/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .base import BaseFetcher
|
|
2
|
+
from .bun import BunFetcher
|
|
3
|
+
from .d3 import D3DevDocsFetcher
|
|
4
|
+
from .nextjs import NextJSFetcher
|
|
5
|
+
from .parallel_base import ParallelFetcher
|
|
6
|
+
from .plaid import PlaidFetcher
|
|
7
|
+
from .react import ReactFetcher
|
|
8
|
+
from .stripe import StripeFetcher
|
|
9
|
+
from .tailwind import TailwindFetcher
|
|
10
|
+
from .turborepo import TurborepoFetcher
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"BaseFetcher",
|
|
14
|
+
"BunFetcher",
|
|
15
|
+
"D3DevDocsFetcher",
|
|
16
|
+
"NextJSFetcher",
|
|
17
|
+
"ParallelFetcher",
|
|
18
|
+
"PlaidFetcher",
|
|
19
|
+
"ReactFetcher",
|
|
20
|
+
"StripeFetcher",
|
|
21
|
+
"TailwindFetcher",
|
|
22
|
+
"TurborepoFetcher",
|
|
23
|
+
]
|