docpull 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docpull/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ __version__ = "1.0.1"
2
+
3
+ from .fetchers.base import BaseFetcher
4
+ from .fetchers.bun import BunFetcher
5
+ from .fetchers.d3 import D3DevDocsFetcher
6
+ from .fetchers.generic import GenericFetcher
7
+ from .fetchers.generic_async import GenericAsyncFetcher
8
+ from .fetchers.nextjs import NextJSFetcher
9
+ from .fetchers.parallel_base import ParallelFetcher
10
+ from .fetchers.plaid import PlaidFetcher
11
+ from .fetchers.react import ReactFetcher
12
+ from .fetchers.stripe import StripeFetcher
13
+ from .fetchers.tailwind import TailwindFetcher
14
+ from .fetchers.turborepo import TurborepoFetcher
15
+
16
+ __all__ = [
17
+ "BaseFetcher",
18
+ "BunFetcher",
19
+ "D3DevDocsFetcher",
20
+ "GenericFetcher",
21
+ "GenericAsyncFetcher",
22
+ "NextJSFetcher",
23
+ "ParallelFetcher",
24
+ "PlaidFetcher",
25
+ "ReactFetcher",
26
+ "StripeFetcher",
27
+ "TailwindFetcher",
28
+ "TurborepoFetcher",
29
+ ]
docpull/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ sys.exit(main())
docpull/cli.py ADDED
@@ -0,0 +1,440 @@
1
+ import argparse
2
+ import sys
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from . import __version__
7
+ from .config import FetcherConfig
8
+ from .fetchers import (
9
+ BunFetcher,
10
+ D3DevDocsFetcher,
11
+ NextJSFetcher,
12
+ PlaidFetcher,
13
+ ReactFetcher,
14
+ StripeFetcher,
15
+ TailwindFetcher,
16
+ TurborepoFetcher,
17
+ )
18
+ from .fetchers.generic_async import GenericAsyncFetcher
19
+ from .utils.logging_config import setup_logging
20
+
21
+
22
def create_parser() -> argparse.ArgumentParser:
    """
    Create the argument parser for the ``docpull`` CLI.

    Two interfaces share this parser: positional ``targets`` (URLs or profile
    names) and the legacy ``--source`` / ``--config`` options. Most options
    default to ``None`` so that later code can tell "not given on the command
    line" apart from an explicit value.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="docpull",
        description="Fetch and convert documentation from any URL or known sources to markdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch from any documentation URL
  docpull https://aptos.dev
  docpull https://docs.anthropic.com

  # Fetch using profile names (shortcuts)
  docpull stripe
  docpull nextjs plaid

  # Mix URLs and profiles
  docpull stripe https://newsite.com/docs

  # Control scraping depth and pages
  docpull https://example.com/docs --max-pages 100 --max-depth 3

  # Legacy syntax still works
  docpull --source stripe --source nextjs

  # Use a config file
  docpull --config config.yaml

  # Generate a sample config file
  docpull --generate-config config.yaml
""",
    )

    # Positional arguments for URLs or profile names
    parser.add_argument(
        "targets",
        nargs="*",
        help="URLs or profile names to fetch (e.g., 'https://docs.site.com', 'stripe', 'nextjs')",
    )

    parser.add_argument(
        "--config",
        "-c",
        type=Path,
        help="Path to config file (YAML or JSON)",
    )

    parser.add_argument(
        "--output-dir",
        "-o",
        type=Path,
        default=None,
        help="Directory to save documentation (default: ./docs)",
    )

    # action="extend" accumulates values across repeated flags, so the
    # documented "--source stripe --source nextjs" form yields both names
    # instead of only the last one (the default "store" action overwrites).
    parser.add_argument(
        "--source",
        "-s",
        action="extend",
        nargs="+",
        choices=["all", "bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"],
        default=None,
        dest="sources",
        help="Documentation source(s) to fetch. Use 'all' for everything. (default: plaid, stripe)",
    )

    parser.add_argument(
        "--rate-limit",
        "-r",
        type=float,
        default=None,
        help="Seconds to wait between requests (default: 0.5)",
    )

    parser.add_argument(
        "--no-skip-existing",
        action="store_true",
        help="Re-fetch files that already exist",
    )

    parser.add_argument(
        "--log-level",
        "-l",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default=None,
        help="Logging level (default: INFO)",
    )

    # --verbose and --quiet write to the same destination; whichever appears
    # last on the command line wins, and both take priority over --log-level
    # in the code that consumes the parsed arguments.
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_const",
        const="DEBUG",
        dest="log_level_override",
        help="Enable verbose output (equivalent to --log-level DEBUG)",
    )

    parser.add_argument(
        "--quiet",
        "-q",
        action="store_const",
        const="ERROR",
        dest="log_level_override",
        help="Suppress informational output (equivalent to --log-level ERROR)",
    )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be fetched without actually downloading",
    )

    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Maximum number of pages to fetch (default: unlimited)",
    )

    parser.add_argument(
        "--max-depth",
        type=int,
        default=5,
        help="Maximum crawl depth when following links (default: 5)",
    )

    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=10,
        help="Maximum concurrent requests for async fetching (default: 10)",
    )

    parser.add_argument(
        "--js",
        "--javascript",
        action="store_true",
        dest="use_js",
        help="Enable JavaScript rendering with Playwright (slower but handles JS-heavy sites)",
    )

    parser.add_argument(
        "--no-progress",
        action="store_true",
        help="Disable progress bars",
    )

    parser.add_argument(
        "--log-file",
        type=Path,
        default=None,
        help="Path to log file (default: console only)",
    )

    parser.add_argument(
        "--generate-config",
        type=Path,
        metavar="PATH",
        help="Generate a sample config file and exit",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )

    return parser
189
+
190
+
191
def generate_sample_config(output_path: Path) -> None:
    """
    Write a default-valued configuration file to ``output_path``.

    The format follows the file extension: ``.json`` produces JSON, ``.yaml``
    and ``.yml`` produce YAML. Any other extension triggers a warning and the
    file is written as YAML with the extension replaced by ``.yaml``.

    Args:
        output_path: Path to save the config file
    """
    defaults = FetcherConfig()
    extension = output_path.suffix.lower()

    if extension == ".json":
        defaults.save_json(output_path)
        print(f"Sample JSON config generated: {output_path}")
        return

    if extension not in (".yaml", ".yml"):
        # Unrecognised extension: fall back to YAML under a corrected name.
        print(f"Warning: Unknown extension {extension}, generating YAML")
        output_path = output_path.with_suffix(".yaml")

    defaults.save_yaml(output_path)
    print(f"Sample YAML config generated: {output_path}")
214
+
215
+
216
def get_config(args: argparse.Namespace) -> FetcherConfig:
    """
    Build the effective configuration for the legacy source-based interface.

    Starts from the config file when ``--config`` was given (otherwise from
    ``FetcherConfig`` defaults) and then applies every option that was set
    explicitly on the command line.

    Args:
        args: Parsed command-line arguments

    Returns:
        FetcherConfig instance
    """
    if args.config:
        config = FetcherConfig.from_file(args.config)
    else:
        config = FetcherConfig()

    # Command-line values take precedence over the file / defaults.
    if args.output_dir is not None:
        config.output_dir = args.output_dir

    requested_sources = args.sources
    if requested_sources is not None:
        # "all" expands to every built-in source.
        config.sources = (
            ["bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"]
            if "all" in requested_sources
            else requested_sources
        )

    if args.rate_limit is not None:
        config.rate_limit = args.rate_limit

    if args.no_skip_existing:
        config.skip_existing = False

    # --verbose / --quiet win over an explicit --log-level.
    shortcut_level = args.log_level_override
    chosen_level = shortcut_level if shortcut_level is not None else args.log_level
    if chosen_level is not None:
        config.log_level = chosen_level

    if args.log_file is not None:
        config.log_file = str(args.log_file)

    # The dry-run flag always comes from the command line.
    config.dry_run = args.dry_run

    return config
259
+
260
+
261
def run_fetchers(config: FetcherConfig) -> int:
    """
    Run the built-in fetchers named in ``config.sources``.

    Each source name is mapped to its fetcher class, which is instantiated
    with the shared output/rate-limit/skip settings and then run. A failure
    in one source is logged and counted; the remaining sources still run.

    In dry-run mode no fetcher is instantiated or run: the sources that would
    be fetched are only logged, so nothing is downloaded.

    Args:
        config: FetcherConfig instance

    Returns:
        Exit code (0 for success, 1 if any source was unknown or failed)
    """
    # Setup logging
    logger = setup_logging(
        level=config.log_level,
        log_file=config.log_file,
    )

    logger.info("docpull - Documentation Fetcher")
    logger.info(f"Mode: {'DRY RUN' if config.dry_run else 'FETCH'}")
    logger.info(f"Output directory: {config.output_dir}")
    logger.info(f"Rate limit: {config.rate_limit}s between requests")
    logger.info(f"Skip existing: {config.skip_existing}")
    logger.info(f"Sources: {', '.join(config.sources)}")
    logger.info("")

    if config.dry_run:
        logger.info("DRY RUN MODE: No files will be downloaded")
        logger.info("")

    # Map source names to fetcher classes
    fetcher_map = {
        "bun": BunFetcher,
        "d3": D3DevDocsFetcher,
        "nextjs": NextJSFetcher,
        "plaid": PlaidFetcher,
        "react": ReactFetcher,
        "stripe": StripeFetcher,
        "tailwind": TailwindFetcher,
        "turborepo": TurborepoFetcher,
    }

    # Run fetchers
    errors = 0
    for source in config.sources:
        fetcher_class = fetcher_map.get(source)
        if fetcher_class is None:
            logger.error(f"Unknown source: {source}")
            errors += 1
            continue

        if config.dry_run:
            # Honour the dry-run promise: report the source, download nothing.
            logger.info(f"Would fetch: {source}")
            continue

        try:
            fetcher = fetcher_class(
                output_dir=config.output_dir,
                rate_limit=config.rate_limit,
                skip_existing=config.skip_existing,
                logger=logger,
            )
            fetcher.fetch()
        except Exception as e:
            # Top-level boundary: log with traceback and move on to the next source.
            logger.error(f"Error fetching {source}: {e}", exc_info=True)
            errors += 1

    logger.info("")
    if errors > 0:
        logger.error(f"Completed with {errors} error(s)")
        return 1

    if config.dry_run:
        logger.info("Dry run complete: no files were downloaded")
    else:
        logger.info("All documentation fetched successfully")
    return 0
329
+
330
+
331
def run_generic_fetchers(args: argparse.Namespace) -> int:
    """
    Run the generic async fetcher for each URL or profile name in ``args.targets``.

    Settings come straight from the parsed command line, with the documented
    defaults applied where an option was not given. A failure for one target
    is logged and counted; the remaining targets still run.

    When ``--dry-run`` is set, the targets are only logged and no fetcher is
    created, so nothing is downloaded.

    Args:
        args: Parsed command-line arguments

    Returns:
        Exit code (0 for success, 1 for error)
    """
    # Setup logging
    log_level = args.log_level_override or args.log_level or "INFO"
    # Pass the log file as a string, matching what the legacy path hands to
    # setup_logging (get_config stores str(args.log_file)).
    log_file = str(args.log_file) if args.log_file is not None else None
    logger = setup_logging(
        level=log_level,
        log_file=log_file,
    )

    output_dir = Path(args.output_dir) if args.output_dir else Path("./docs")
    rate_limit = args.rate_limit if args.rate_limit is not None else 0.5
    skip_existing = not args.no_skip_existing
    max_pages = args.max_pages
    max_depth = args.max_depth
    max_concurrent = args.max_concurrent
    use_js = args.use_js
    show_progress = not args.no_progress
    # getattr keeps namespaces built without a dry_run attribute working.
    dry_run = bool(getattr(args, "dry_run", False))

    logger.info("docpull - Universal Documentation Fetcher")
    logger.info(f"Targets: {', '.join(args.targets)}")
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Rate limit: {rate_limit}s between requests")
    logger.info(f"Skip existing: {skip_existing}")
    logger.info(f"Max concurrent: {max_concurrent}")
    if max_pages:
        logger.info(f"Max pages: {max_pages}")
    logger.info(f"Max depth: {max_depth}")
    if use_js:
        logger.info("JavaScript rendering: ENABLED (slower but handles JS sites)")
    logger.info("")

    if dry_run:
        logger.info("DRY RUN MODE: No files will be downloaded")
        logger.info("")

    # Run async generic fetcher for each target
    errors = 0
    for target in args.targets:
        if dry_run:
            # Report the target only; do not construct or run a fetcher.
            logger.info(f"Would fetch: {target}")
            continue

        try:
            logger.info(f"Fetching: {target}")
            fetcher = GenericAsyncFetcher(
                url_or_profile=target,
                output_dir=output_dir,
                rate_limit=rate_limit,
                skip_existing=skip_existing,
                logger=logger,
                max_pages=max_pages,
                max_depth=max_depth,
                max_concurrent=max_concurrent,
                use_js=use_js,
                show_progress=show_progress,
            )
            fetcher.fetch()  # This calls asyncio.run() internally
        except Exception as e:
            # Top-level boundary: log with traceback and continue with the next target.
            logger.error(f"Error fetching {target}: {e}", exc_info=True)
            errors += 1

    logger.info("")
    if errors > 0:
        logger.error(f"Completed with {errors} error(s)")
        return 1

    if dry_run:
        logger.info("Dry run complete: no files were downloaded")
    else:
        logger.info("All documentation fetched successfully")
    return 0
399
+
400
+
401
def main(argv: Optional[list[str]] = None) -> int:
    """
    Parse the command line and dispatch to the matching mode.

    Modes, in order of precedence: ``--generate-config`` (write a sample
    config and stop), positional targets (generic URL/profile fetching),
    otherwise the legacy source-based fetchers.

    Args:
        argv: Command-line arguments (defaults to sys.argv)

    Returns:
        Exit code
    """
    args = create_parser().parse_args(argv)

    # --generate-config short-circuits everything else.
    if args.generate_config:
        try:
            generate_sample_config(args.generate_config)
        except Exception as e:
            print(f"Error generating config: {e}", file=sys.stderr)
            return 1
        return 0

    # Any positional target selects the new URL-based interface.
    if args.targets:
        return run_generic_fetchers(args)

    # Otherwise fall back to the legacy source-based interface.
    try:
        config = get_config(args)
    except Exception as e:
        print(f"Error loading configuration: {e}", file=sys.stderr)
        return 1
    return run_fetchers(config)
437
+
438
+
439
+ if __name__ == "__main__":
440
+ sys.exit(main())
docpull/config.py ADDED
@@ -0,0 +1,199 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Optional
4
+
5
+ try:
6
+ import yaml # type: ignore
7
+ except ImportError:
8
+ yaml = None # type: ignore
9
+
10
+
11
class FetcherConfig:
    """Configuration for documentation fetchers."""

    def __init__(
        self,
        output_dir: str = "./docs",
        rate_limit: float = 0.5,
        skip_existing: bool = True,
        log_level: str = "INFO",
        log_file: Optional[str] = None,
        sources: Optional[list[str]] = None,
        dry_run: bool = False,
    ):
        """
        Initialize configuration.

        Args:
            output_dir: Directory to save documentation
            rate_limit: Seconds between requests
            skip_existing: Skip existing files
            log_level: Logging level
            log_file: Optional log file path
            sources: List of sources to fetch (e.g., ['stripe', 'plaid']);
                an empty or missing list falls back to ['plaid', 'stripe']
            dry_run: Dry run mode (don't download files)
        """
        self.output_dir = Path(output_dir)
        self.rate_limit = rate_limit
        self.skip_existing = skip_existing
        self.log_level = log_level
        self.log_file = log_file
        self.sources = sources or ["plaid", "stripe"]
        self.dry_run = dry_run

    @classmethod
    def from_dict(cls, config_dict: dict[str, Any]) -> "FetcherConfig":
        """
        Create configuration from dictionary.

        Args:
            config_dict: Configuration dictionary. ``None`` (what an empty
                config file parses to) is treated as an empty dictionary.

        Returns:
            FetcherConfig instance

        Raises:
            ValueError: If configuration values are invalid
        """
        if config_dict is None:
            config_dict = {}
        if not isinstance(config_dict, dict):
            raise ValueError("Configuration must be a mapping of option names to values")

        # Validate output_dir doesn't contain path traversal
        output_dir = str(config_dict.get("output_dir", "./docs"))
        if ".." in output_dir or output_dir.startswith(("/etc", "/sys")):
            raise ValueError("Invalid output directory path")

        # Validate rate_limit is reasonable. bool is a subclass of int, so it
        # is excluded explicitly: "rate_limit: true" is not a number of seconds.
        rate_limit = config_dict.get("rate_limit", 0.5)
        if (
            isinstance(rate_limit, bool)
            or not isinstance(rate_limit, (int, float))
            or rate_limit < 0
            or rate_limit > 60
        ):
            raise ValueError("rate_limit must be between 0 and 60")

        # Validate sources. A bare string would otherwise be iterated
        # character by character, and None is not iterable at all.
        valid_sources = {"bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"}
        sources = config_dict.get("sources", ["plaid", "stripe"])
        if not isinstance(sources, (list, tuple)) or not all(
            isinstance(s, str) and s in valid_sources for s in sources
        ):
            raise ValueError(f"Invalid sources. Must be from: {valid_sources}")
        sources = list(sources)

        # Validate log_level and store it in canonical upper case, since the
        # check itself is case-insensitive.
        log_level = config_dict.get("log_level", "INFO")
        valid_log_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
        if not isinstance(log_level, str) or log_level.upper() not in valid_log_levels:
            raise ValueError(f"Invalid log_level. Must be one of: {valid_log_levels}")
        log_level = log_level.upper()

        return cls(
            output_dir=output_dir,
            rate_limit=rate_limit,
            skip_existing=config_dict.get("skip_existing", True),
            log_level=log_level,
            log_file=config_dict.get("log_file"),
            sources=sources,
            dry_run=config_dict.get("dry_run", False),
        )

    @classmethod
    def from_yaml(cls, yaml_path: Path) -> "FetcherConfig":
        """
        Load configuration from YAML file.

        Args:
            yaml_path: Path to YAML config file

        Returns:
            FetcherConfig instance

        Raises:
            ImportError: If pyyaml is not installed
            FileNotFoundError: If config file doesn't exist
            ValueError: If configuration values are invalid
        """
        if yaml is None:
            raise ImportError("PyYAML is required for YAML config. Install with: pip install pyyaml")

        if not yaml_path.exists():
            raise FileNotFoundError(f"Config file not found: {yaml_path}")

        with open(yaml_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document; from_dict
            # treats that as "use all defaults".
            config_dict = yaml.safe_load(f)

        return cls.from_dict(config_dict)

    @classmethod
    def from_json(cls, json_path: Path) -> "FetcherConfig":
        """
        Load configuration from JSON file.

        Args:
            json_path: Path to JSON config file

        Returns:
            FetcherConfig instance

        Raises:
            FileNotFoundError: If config file doesn't exist
            ValueError: If configuration values are invalid
        """
        if not json_path.exists():
            raise FileNotFoundError(f"Config file not found: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            config_dict = json.load(f)

        return cls.from_dict(config_dict)

    @classmethod
    def from_file(cls, config_path: Path) -> "FetcherConfig":
        """
        Load configuration from file (auto-detect format).

        Args:
            config_path: Path to config file

        Returns:
            FetcherConfig instance

        Raises:
            ValueError: If the file extension is not .yaml, .yml or .json
        """
        suffix = config_path.suffix.lower()

        if suffix in (".yaml", ".yml"):
            return cls.from_yaml(config_path)
        if suffix == ".json":
            return cls.from_json(config_path)
        raise ValueError(f"Unsupported config file format: {suffix}")

    def to_dict(self) -> dict[str, Any]:
        """
        Convert configuration to dictionary.

        Returns:
            Configuration as dictionary (``output_dir`` is rendered as a string)
        """
        return {
            "output_dir": str(self.output_dir),
            "rate_limit": self.rate_limit,
            "skip_existing": self.skip_existing,
            "log_level": self.log_level,
            "log_file": self.log_file,
            "sources": self.sources,
            "dry_run": self.dry_run,
        }

    def save_yaml(self, yaml_path: Path) -> None:
        """
        Save configuration to YAML file.

        Args:
            yaml_path: Path to save YAML config

        Raises:
            ImportError: If pyyaml is not installed
        """
        if yaml is None:
            raise ImportError("PyYAML is required for YAML config. Install with: pip install pyyaml")

        with open(yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

    def save_json(self, json_path: Path) -> None:
        """
        Save configuration to JSON file.

        Args:
            json_path: Path to save JSON config
        """
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)
@@ -0,0 +1,23 @@
1
+ from .base import BaseFetcher
2
+ from .bun import BunFetcher
3
+ from .d3 import D3DevDocsFetcher
4
+ from .nextjs import NextJSFetcher
5
+ from .parallel_base import ParallelFetcher
6
+ from .plaid import PlaidFetcher
7
+ from .react import ReactFetcher
8
+ from .stripe import StripeFetcher
9
+ from .tailwind import TailwindFetcher
10
+ from .turborepo import TurborepoFetcher
11
+
12
+ __all__ = [
13
+ "BaseFetcher",
14
+ "BunFetcher",
15
+ "D3DevDocsFetcher",
16
+ "NextJSFetcher",
17
+ "ParallelFetcher",
18
+ "PlaidFetcher",
19
+ "ReactFetcher",
20
+ "StripeFetcher",
21
+ "TailwindFetcher",
22
+ "TurborepoFetcher",
23
+ ]