capcat 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- capcat-1.0.0/LICENSE.txt +42 -0
- capcat-1.0.0/PKG-INFO +384 -0
- capcat-1.0.0/README.md +303 -0
- capcat-1.0.0/capcat/__init__.py +4 -0
- capcat-1.0.0/capcat/__main__.py +4 -0
- capcat-1.0.0/capcat/cli.py +440 -0
- capcat-1.0.0/capcat/commands/__init__.py +0 -0
- capcat-1.0.0/capcat/commands/add_source.py +23 -0
- capcat-1.0.0/capcat/commands/fetch.py +73 -0
- capcat-1.0.0/capcat/commands/generate_config.py +32 -0
- capcat-1.0.0/capcat/commands/init.py +74 -0
- capcat-1.0.0/capcat/commands/remove_source.py +60 -0
- capcat-1.0.0/capcat/commands/single.py +219 -0
- capcat-1.0.0/capcat/core/__init__.py +0 -0
- capcat-1.0.0/capcat/core/article_fetcher.py +2720 -0
- capcat-1.0.0/capcat/core/circuit_breaker.py +479 -0
- capcat-1.0.0/capcat/core/cli_recovery.py +184 -0
- capcat-1.0.0/capcat/core/cli_validation.py +161 -0
- capcat-1.0.0/capcat/core/command_logging.py +157 -0
- capcat-1.0.0/capcat/core/config/__init__.py +216 -0
- capcat-1.0.0/capcat/core/config/source_base.py +321 -0
- capcat-1.0.0/capcat/core/config/source_registry.py +443 -0
- capcat-1.0.0/capcat/core/config.py +476 -0
- capcat-1.0.0/capcat/core/constants.py +58 -0
- capcat-1.0.0/capcat/core/conversion_executor.py +70 -0
- capcat-1.0.0/capcat/core/design_system_compiler.py +465 -0
- capcat-1.0.0/capcat/core/downloader.py +427 -0
- capcat-1.0.0/capcat/core/enhanced_argparse.py +113 -0
- capcat-1.0.0/capcat/core/error_handling.py +485 -0
- capcat-1.0.0/capcat/core/ethical_scraping.py +369 -0
- capcat-1.0.0/capcat/core/exceptions.py +185 -0
- capcat-1.0.0/capcat/core/formatter.py +1172 -0
- capcat-1.0.0/capcat/core/html_generator.py +1716 -0
- capcat-1.0.0/capcat/core/html_post_processor.py +661 -0
- capcat-1.0.0/capcat/core/image_processor.py +541 -0
- capcat-1.0.0/capcat/core/interactive.py +676 -0
- capcat-1.0.0/capcat/core/logging_config.py +203 -0
- capcat-1.0.0/capcat/core/media_config.py +289 -0
- capcat-1.0.0/capcat/core/media_executor.py +69 -0
- capcat-1.0.0/capcat/core/media_processor.py +906 -0
- capcat-1.0.0/capcat/core/network_resilience.py +569 -0
- capcat-1.0.0/capcat/core/news_source_adapter.py +1117 -0
- capcat-1.0.0/capcat/core/progress.py +1096 -0
- capcat-1.0.0/capcat/core/rate_limiter.py +281 -0
- capcat-1.0.0/capcat/core/retry.py +212 -0
- capcat-1.0.0/capcat/core/retry_skip.py +181 -0
- capcat-1.0.0/capcat/core/session_pool.py +186 -0
- capcat-1.0.0/capcat/core/shutdown.py +194 -0
- capcat-1.0.0/capcat/core/source_config.py +91 -0
- capcat-1.0.0/capcat/core/source_configs.py +133 -0
- capcat-1.0.0/capcat/core/source_factory.py +53 -0
- capcat-1.0.0/capcat/core/source_system/__init__.py +0 -0
- capcat-1.0.0/capcat/core/source_system/add_source_command.py +313 -0
- capcat-1.0.0/capcat/core/source_system/add_source_service.py +85 -0
- capcat-1.0.0/capcat/core/source_system/base_source.py +458 -0
- capcat-1.0.0/capcat/core/source_system/bundle_manager.py +446 -0
- capcat-1.0.0/capcat/core/source_system/bundle_models.py +61 -0
- capcat-1.0.0/capcat/core/source_system/bundle_service.py +472 -0
- capcat-1.0.0/capcat/core/source_system/bundle_ui.py +398 -0
- capcat-1.0.0/capcat/core/source_system/bundle_validator.py +220 -0
- capcat-1.0.0/capcat/core/source_system/config_driven_source.py +225 -0
- capcat-1.0.0/capcat/core/source_system/discovery_strategies.py +449 -0
- capcat-1.0.0/capcat/core/source_system/enhanced_remove_command.py +662 -0
- capcat-1.0.0/capcat/core/source_system/feed_discovery.py +186 -0
- capcat-1.0.0/capcat/core/source_system/feed_parser.py +293 -0
- capcat-1.0.0/capcat/core/source_system/performance_monitor.py +438 -0
- capcat-1.0.0/capcat/core/source_system/questionary_ui.py +221 -0
- capcat-1.0.0/capcat/core/source_system/removal_ui.py +166 -0
- capcat-1.0.0/capcat/core/source_system/remove_source_service.py +79 -0
- capcat-1.0.0/capcat/core/source_system/rss_feed_introspector.py +73 -0
- capcat-1.0.0/capcat/core/source_system/source_analytics.py +319 -0
- capcat-1.0.0/capcat/core/source_system/source_backup_manager.py +293 -0
- capcat-1.0.0/capcat/core/source_system/source_config.py +82 -0
- capcat-1.0.0/capcat/core/source_system/source_config_generator.py +66 -0
- capcat-1.0.0/capcat/core/source_system/source_factory.py +388 -0
- capcat-1.0.0/capcat/core/source_system/source_registry.py +488 -0
- capcat-1.0.0/capcat/core/source_system/validation_engine.py +888 -0
- capcat-1.0.0/capcat/core/specialized_source_manager.py +243 -0
- capcat-1.0.0/capcat/core/storage_manager.py +108 -0
- capcat-1.0.0/capcat/core/streamlined_comment_processor.py +361 -0
- capcat-1.0.0/capcat/core/template_renderer.py +158 -0
- capcat-1.0.0/capcat/core/theme_utils.py +78 -0
- capcat-1.0.0/capcat/core/timeout_config.py +332 -0
- capcat-1.0.0/capcat/core/timeout_wrapper.py +87 -0
- capcat-1.0.0/capcat/core/unified_article_processor.py +347 -0
- capcat-1.0.0/capcat/core/unified_media_processor.py +133 -0
- capcat-1.0.0/capcat/core/unified_source_processor.py +807 -0
- capcat-1.0.0/capcat/core/update_manager.py +420 -0
- capcat-1.0.0/capcat/core/url_utils.py +241 -0
- capcat-1.0.0/capcat/core/utils.py +277 -0
- capcat-1.0.0/capcat/htmlgen/__init__.py +88 -0
- capcat-1.0.0/capcat/htmlgen/base/base_generator.py +574 -0
- capcat-1.0.0/capcat/htmlgen/hn/generator.py +390 -0
- capcat-1.0.0/capcat/htmlgen/lb/generator.py +446 -0
- capcat-1.0.0/capcat/htmlgen/lesswrong/generator.py +426 -0
- capcat-1.0.0/capcat/sources/__init__.py +0 -0
- capcat-1.0.0/capcat/sources/base/__init__.py +7 -0
- capcat-1.0.0/capcat/sources/base/config_schema.py +168 -0
- capcat-1.0.0/capcat/sources/base/factory.py +103 -0
- capcat-1.0.0/capcat/sources/builtin/__init__.py +0 -0
- capcat-1.0.0/capcat/sources/builtin/bundles.yml +48 -0
- capcat-1.0.0/capcat/sources/builtin/business_sources.yml +3 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/bbc.yaml +63 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/bbcsport.yaml +70 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/google-reserch.yml +12 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/guardian.yaml +95 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/ieee.yaml +73 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/iq.yaml +67 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/mashable.yml +12 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/mitnews.yaml +89 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/nature.yaml +69 -0
- capcat-1.0.0/capcat/sources/builtin/config_driven/configs/scientificamerican.yaml +67 -0
- capcat-1.0.0/capcat/sources/builtin/custom/hn/config.yaml +21 -0
- capcat-1.0.0/capcat/sources/builtin/custom/hn/source.py +387 -0
- capcat-1.0.0/capcat/sources/builtin/custom/lb/config.yaml +26 -0
- capcat-1.0.0/capcat/sources/builtin/custom/lb/source.py +702 -0
- capcat-1.0.0/capcat/sources/builtin/news_sources.yml +13 -0
- capcat-1.0.0/capcat/sources/builtin/science_sources.yml +20 -0
- capcat-1.0.0/capcat/sources/builtin/tech_sources.yml +55 -0
- capcat-1.0.0/capcat/sources/specialized/__init__.py +50 -0
- capcat-1.0.0/capcat/sources/specialized/medium/source.py +623 -0
- capcat-1.0.0/capcat/sources/specialized/substack/source.py +711 -0
- capcat-1.0.0/capcat/sources/specialized/twitter/__init__.py +4 -0
- capcat-1.0.0/capcat/sources/specialized/twitter/source.py +84 -0
- capcat-1.0.0/capcat/sources/specialized/vimeo/__init__.py +4 -0
- capcat-1.0.0/capcat/sources/specialized/vimeo/source.py +164 -0
- capcat-1.0.0/capcat/sources/specialized/youtube/__init__.py +4 -0
- capcat-1.0.0/capcat/sources/specialized/youtube/source.py +132 -0
- capcat-1.0.0/capcat/themes/DS-logic.md +75 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Bold.woff +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Bold.woff2 +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Light.woff +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Light.woff2 +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Medium.woff +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Medium.woff2 +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Regular.woff +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Regular.woff2 +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-SemiBold.woff +0 -0
- capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-SemiBold.woff2 +0 -0
- capcat-1.0.0/capcat/themes/base.css +1529 -0
- capcat-1.0.0/capcat/themes/design-system.css +518 -0
- capcat-1.0.0/capcat/themes/js/capcat.js +273 -0
- capcat-1.0.0/capcat/themes/test new orange +2 -0
- capcat-1.0.0/capcat/tui.py +8 -0
- capcat-1.0.0/capcat.egg-info/PKG-INFO +384 -0
- capcat-1.0.0/capcat.egg-info/SOURCES.txt +191 -0
- capcat-1.0.0/capcat.egg-info/dependency_links.txt +1 -0
- capcat-1.0.0/capcat.egg-info/entry_points.txt +2 -0
- capcat-1.0.0/capcat.egg-info/requires.txt +25 -0
- capcat-1.0.0/capcat.egg-info/top_level.txt +1 -0
- capcat-1.0.0/pyproject.toml +180 -0
- capcat-1.0.0/setup.cfg +4 -0
- capcat-1.0.0/tests/test_add_source_command_refactored.py +545 -0
- capcat-1.0.0/tests/test_add_source_service.py +188 -0
- capcat-1.0.0/tests/test_batch_specialized_integration.py +398 -0
- capcat-1.0.0/tests/test_bundle_manager.py +67 -0
- capcat-1.0.0/tests/test_bundle_manager_remove.py +150 -0
- capcat-1.0.0/tests/test_bundle_validator.py +566 -0
- capcat-1.0.0/tests/test_cleanup.py +249 -0
- capcat-1.0.0/tests/test_cli_add_source.py +88 -0
- capcat-1.0.0/tests/test_discovery_strategies.py +292 -0
- capcat-1.0.0/tests/test_error_handling.py +476 -0
- capcat-1.0.0/tests/test_ethical_scraping.py +61 -0
- capcat-1.0.0/tests/test_feed_date_sorting.py +110 -0
- capcat-1.0.0/tests/test_feed_parser.py +232 -0
- capcat-1.0.0/tests/test_fetch_arguments.py +337 -0
- capcat-1.0.0/tests/test_fetch_command.py +43 -0
- capcat-1.0.0/tests/test_formatter_none_handling.py +83 -0
- capcat-1.0.0/tests/test_help_examples.py +394 -0
- capcat-1.0.0/tests/test_hn_pagination.py +369 -0
- capcat-1.0.0/tests/test_index_filename_detection.py +138 -0
- capcat-1.0.0/tests/test_interactive.py +67 -0
- capcat-1.0.0/tests/test_list_command.py +335 -0
- capcat-1.0.0/tests/test_lobsters_pagination.py +395 -0
- capcat-1.0.0/tests/test_media_processor_step1.py +297 -0
- capcat-1.0.0/tests/test_media_processor_step2.py +228 -0
- capcat-1.0.0/tests/test_pdf_handling.py +201 -0
- capcat-1.0.0/tests/test_pdf_skip_prompt.py +466 -0
- capcat-1.0.0/tests/test_pep8_refactoring.py +337 -0
- capcat-1.0.0/tests/test_remove_source_command.py +410 -0
- capcat-1.0.0/tests/test_retry_skip_logic.py +464 -0
- capcat-1.0.0/tests/test_rss_feed_introspector.py +96 -0
- capcat-1.0.0/tests/test_single_command.py +47 -0
- capcat-1.0.0/tests/test_source_analytics.py +273 -0
- capcat-1.0.0/tests/test_source_backup_manager.py +202 -0
- capcat-1.0.0/tests/test_source_config_generator.py +62 -0
- capcat-1.0.0/tests/test_source_management_menu.py +503 -0
- capcat-1.0.0/tests/test_theme_hash_persistence.py +118 -0
- capcat-1.0.0/tests/test_thread_safe_timeout.py +271 -0
- capcat-1.0.0/tests/test_title_truncation.py +115 -0
- capcat-1.0.0/tests/test_unified_article_processor.py +269 -0
- capcat-1.0.0/tests/test_url_replacement.py +379 -0
- capcat-1.0.0/tests/test_video_sources.py +138 -0
capcat-1.0.0/LICENSE.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
MIT-Style Non-Commercial License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Stayu Kasabov
|
|
4
|
+
|
|
5
|
+
Original Product: Capcat - News Article Archiving System
|
|
6
|
+
Author: Stayu Kasabov | https://stayux.com
|
|
7
|
+
Product Designer with Holistic Production Expertise
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction for NON-COMMERCIAL PURPOSES ONLY,
|
|
12
|
+
including without limitation the rights to use, copy, modify, merge, publish,
|
|
13
|
+
distribute, sublicense, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
1. NON-COMMERCIAL USE ONLY: This software may not be used for commercial
|
|
17
|
+
purposes. Commercial purposes include, but are not limited to: selling
|
|
18
|
+
the software, using it in a commercial product or service, or using it
|
|
19
|
+
to generate revenue.
|
|
20
|
+
|
|
21
|
+
2. ATTRIBUTION: The above copyright notice and this permission notice shall
|
|
22
|
+
be included in all copies or substantial portions of the Software. Credit
|
|
23
|
+
must be given to the original author: Stayu Kasabov (https://stayux.com)
|
|
24
|
+
|
|
25
|
+
3. SHARE ALIKE: Any modifications or derivative works must be released under
|
|
26
|
+
the same non-commercial terms.
|
|
27
|
+
|
|
28
|
+
4. CONTRIBUTIONS WELCOME: Users are encouraged to contribute improvements
|
|
29
|
+
back to the original project.
|
|
30
|
+
|
|
31
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
32
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
33
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
34
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
35
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
36
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
37
|
+
SOFTWARE.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
A free and open-source tool to make people's lives easier.
|
|
42
|
+
Contributions welcome! Contact: Stayu Kasabov | https://stayux.com
|
capcat-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: capcat
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A command-line tool designed to solve content preservation challenges with Ethical Scraping.
|
|
5
|
+
Author: Stayu Kasabov - Product Designer and Experiences Builder | AI-powered Prototyping & MVP | Strategic Generalist
|
|
6
|
+
License: MIT-Style Non-Commercial License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Stayu Kasabov
|
|
9
|
+
|
|
10
|
+
Original Product: Capcat - News Article Archiving System
|
|
11
|
+
Author: Stayu Kasabov | https://stayux.com
|
|
12
|
+
Product Designer with Holistic Production Expertise
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction for NON-COMMERCIAL PURPOSES ONLY,
|
|
17
|
+
including without limitation the rights to use, copy, modify, merge, publish,
|
|
18
|
+
distribute, sublicense, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
1. NON-COMMERCIAL USE ONLY: This software may not be used for commercial
|
|
22
|
+
purposes. Commercial purposes include, but are not limited to: selling
|
|
23
|
+
the software, using it in a commercial product or service, or using it
|
|
24
|
+
to generate revenue.
|
|
25
|
+
|
|
26
|
+
2. ATTRIBUTION: The above copyright notice and this permission notice shall
|
|
27
|
+
be included in all copies or substantial portions of the Software. Credit
|
|
28
|
+
must be given to the original author: Stayu Kasabov (https://stayux.com)
|
|
29
|
+
|
|
30
|
+
3. SHARE ALIKE: Any modifications or derivative works must be released under
|
|
31
|
+
the same non-commercial terms.
|
|
32
|
+
|
|
33
|
+
4. CONTRIBUTIONS WELCOME: Users are encouraged to contribute improvements
|
|
34
|
+
back to the original project.
|
|
35
|
+
|
|
36
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
37
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
38
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
39
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
40
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
41
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
42
|
+
SOFTWARE.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
A free and open-source tool to make people's lives easier.
|
|
47
|
+
Contributions welcome! Contact: Stayu Kasabov | https://stayux.com
|
|
48
|
+
Project-URL: Homepage, https://github.com/stayukasabov/capcat
|
|
49
|
+
Classifier: Programming Language :: Python :: 3
|
|
50
|
+
Classifier: License :: Free for non-commercial use
|
|
51
|
+
Classifier: Operating System :: OS Independent
|
|
52
|
+
Classifier: Environment :: Console
|
|
53
|
+
Requires-Python: >=3.8
|
|
54
|
+
Description-Content-Type: text/markdown
|
|
55
|
+
License-File: LICENSE.txt
|
|
56
|
+
Requires-Dist: requests<3.0.0,>=2.28.0
|
|
57
|
+
Requires-Dist: beautifulsoup4<5.0.0,>=4.12.0
|
|
58
|
+
Requires-Dist: PyYAML<7.0,>=6.0
|
|
59
|
+
Requires-Dist: feedparser<7.0,>=6.0
|
|
60
|
+
Requires-Dist: questionary<3.0,>=2.0
|
|
61
|
+
Requires-Dist: markdownify<1.0,>=0.11
|
|
62
|
+
Requires-Dist: lxml<6.0,>=4.9
|
|
63
|
+
Requires-Dist: ruamel.yaml<0.19,>=0.17
|
|
64
|
+
Requires-Dist: validators<1.0,>=0.20
|
|
65
|
+
Requires-Dist: prompt_toolkit<4.0,>=3.0
|
|
66
|
+
Requires-Dist: yt-dlp<2027.0.0,>=2023.1.6
|
|
67
|
+
Requires-Dist: markdown<4.0,>=3.5
|
|
68
|
+
Requires-Dist: pygments<3.0,>=2.16
|
|
69
|
+
Requires-Dist: charset-normalizer<4.0,>=3.0
|
|
70
|
+
Requires-Dist: brotli<2.0,>=1.0
|
|
71
|
+
Requires-Dist: pynput<2.0,>=1.7.6
|
|
72
|
+
Requires-Dist: rich<15.0,>=13.0
|
|
73
|
+
Provides-Extra: dev
|
|
74
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
75
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
76
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
77
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
78
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
79
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
80
|
+
Dynamic: license-file
|
|
81
|
+
|
|
82
|
+
# Capcat - Archive and Share Articles with Confidence
|
|
83
|
+
|
|
84
|
+
A dual-mode news archiving tool that captures articles from 12 curated sources as **clean Markdown files** (Obsidian-ready) with optional **self-contained HTML** output - perfect for knowledge management and offline sharing.
|
|
85
|
+
|
|
86
|
+
## Why Capcat?
|
|
87
|
+
|
|
88
|
+
**Build Your Knowledge Base**: Every article saved as clean Markdown - drop directly into Obsidian for full-text search, backlinks, and graph views. Perfect for researchers and lifelong learners.
|
|
89
|
+
|
|
90
|
+
**Share Without Breaking**: Optional self-contained HTML output with all styles and scripts embedded. Send to anyone, open anywhere, years later - it just works.
|
|
91
|
+
|
|
92
|
+
**Two Ways to Use**:
|
|
93
|
+
- **Interactive Menu** (`./capcat catch`) - Visual interface for browsing sources and bundles
|
|
94
|
+
- **Command Line** - Fast automation for power users
|
|
95
|
+
|
|
96
|
+
**Curated Bundles**: Pre-configured collections like Tech, AI, Science, News - fetch multiple related sources at once.
|
|
97
|
+
|
|
98
|
+
## Quick Start
|
|
99
|
+
|
|
100
|
+
### Interactive Mode (Recommended)
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
./capcat catch
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Choose from:
|
|
107
|
+
- **Fetch by Source** - Browse 12 curated sources (Hacker News, BBC, IEEE, Nature, etc.)
|
|
108
|
+
- **Fetch by Bundle** - Curated collections (Tech, AI, Science, News, Sports)
|
|
109
|
+
- **Single Article** - Archive any URL instantly
|
|
110
|
+
- **Source Management** - Add custom RSS/news sources
|
|
111
|
+
|
|
112
|
+
### Command Line Mode
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Fetch curated tech bundle (IEEE + Mashable)
|
|
116
|
+
./capcat bundle tech --count 10
|
|
117
|
+
|
|
118
|
+
# Fetch specific sources with media
|
|
119
|
+
./capcat fetch hn,bbc --count 15 --media
|
|
120
|
+
|
|
121
|
+
# Archive a single article
|
|
122
|
+
./capcat single https://example.com/article
|
|
123
|
+
|
|
124
|
+
# List all available sources
|
|
125
|
+
./capcat list sources
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Key Features
|
|
129
|
+
|
|
130
|
+
### Self-Contained HTML for Easy Sharing
|
|
131
|
+
|
|
132
|
+
Every article is a **complete, portable HTML file**:
|
|
133
|
+
- **Embedded CSS** - All styles inline, no external stylesheets
|
|
134
|
+
- **Embedded JavaScript** - Interactive features work offline
|
|
135
|
+
- **Local Images** - Downloaded and stored with the article
|
|
136
|
+
- **No Dependencies** - Open in any browser, share via email, archive forever
|
|
137
|
+
|
|
138
|
+
**Perfect for**:
|
|
139
|
+
- Email attachments that always look right
|
|
140
|
+
- Long-term archiving without link rot
|
|
141
|
+
- Offline reading on any device
|
|
142
|
+
- Sharing articles that might disappear
|
|
143
|
+
|
|
144
|
+
### Dual Interface
|
|
145
|
+
|
|
146
|
+
**Interactive Menu** (`./capcat catch`):
|
|
147
|
+
- Visual source selection
|
|
148
|
+
- Bundle browsing
|
|
149
|
+
- Progress tracking
|
|
150
|
+
- Error handling with retries
|
|
151
|
+
- No commands to memorize
|
|
152
|
+
|
|
153
|
+
**Command Line**:
|
|
154
|
+
- Fast automation and scripting
|
|
155
|
+
- Batch processing
|
|
156
|
+
- CI/CD integration
|
|
157
|
+
- Power user workflows
|
|
158
|
+
|
|
159
|
+
### Smart Content Extraction
|
|
160
|
+
|
|
161
|
+
- **12 Curated Sources** - HN, BBC, Guardian, Nature, IEEE, Scientific American, MIT News, and more
|
|
162
|
+
- **Intelligent Fallback** - Finds images even when primary extraction misses them
|
|
163
|
+
- **Comment Preservation** - Captures discussions with privacy anonymization
|
|
164
|
+
- **Media Handling** - Images always downloaded, video/audio/PDFs with `--media` flag
|
|
165
|
+
|
|
166
|
+
### Markdown-Native Output
|
|
167
|
+
|
|
168
|
+
- **Obsidian-Ready** - Clean markdown files you can drop directly into your vault
|
|
169
|
+
- **Portable Archives** - Standard markdown format works everywhere
|
|
170
|
+
- **Local Images** - All media downloaded and referenced with relative paths
|
|
171
|
+
- **Metadata Headers** - Source, date, and URL preserved in frontmatter-style headers
|
|
172
|
+
|
|
173
|
+
### Bundle System
|
|
174
|
+
|
|
175
|
+
Pre-configured topic collections:
|
|
176
|
+
|
|
177
|
+
| Bundle | Sources | Description |
|
|
178
|
+
|--------|---------|-------------|
|
|
179
|
+
| `tech` | IEEE, Mashable | Consumer technology news |
|
|
180
|
+
| `techpro` | HN, Lobsters, InfoQ | Professional developer news |
|
|
181
|
+
| `ai` | MIT News, Google Research | AI research and developments |
|
|
182
|
+
| `science` | Nature, Scientific American | Scientific publications |
|
|
183
|
+
| `news` | BBC, Guardian | General news |
|
|
184
|
+
| `sports` | BBC Sport | Sports coverage |
|
|
185
|
+
|
|
186
|
+
Add your own bundles in `sources/active/bundles.yml`.
|
|
187
|
+
|
|
188
|
+
## Installation
|
|
189
|
+
|
|
190
|
+
### Quick Setup
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Clone the repository
|
|
194
|
+
git clone https://github.com/stayukasabov/capcat.git
|
|
195
|
+
cd capcat/Application
|
|
196
|
+
|
|
197
|
+
# Auto-fix dependencies (recommended)
|
|
198
|
+
./scripts/fix_dependencies.sh
|
|
199
|
+
|
|
200
|
+
# Or manual setup
|
|
201
|
+
python3 -m venv venv
|
|
202
|
+
source venv/bin/activate
|
|
203
|
+
pip install -r requirements.txt
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### First Run
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
# Launch interactive menu
|
|
210
|
+
./capcat catch
|
|
211
|
+
|
|
212
|
+
# Or try a quick fetch
|
|
213
|
+
./capcat fetch hn --count 5
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Markdown-First Workflow (Obsidian Compatible)
|
|
217
|
+
|
|
218
|
+
Every article is saved as **clean Markdown** with proper formatting:
|
|
219
|
+
|
|
220
|
+
```markdown
|
|
221
|
+
# Article Title
|
|
222
|
+
|
|
223
|
+
**Source**: Hacker News | **Date**: 2025-12-31 | **URL**: [Original Link]
|
|
224
|
+
|
|
225
|
+
## Content
|
|
226
|
+
|
|
227
|
+
Article body with images referenced locally...
|
|
228
|
+
|
|
229
|
+

|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**Perfect for Knowledge Management**:
|
|
233
|
+
- **Obsidian**: Drag folders directly into your vault for full-text search and backlinks
|
|
234
|
+
- **Notion**: Import markdown files while preserving structure
|
|
235
|
+
- **Logseq/Roam**: Compatible with daily notes and graph views
|
|
236
|
+
- **Standard Editors**: Works in VS Code, Typora, iA Writer, or any markdown editor
|
|
237
|
+
|
|
238
|
+
**Metadata Included**:
|
|
239
|
+
- Source attribution
|
|
240
|
+
- Publication date
|
|
241
|
+
- Original URLs
|
|
242
|
+
- Local image paths (relative linking)
|
|
243
|
+
|
|
244
|
+
## Output Structure
|
|
245
|
+
|
|
246
|
+
### Batch Mode (fetch/bundle)
|
|
247
|
+
```
|
|
248
|
+
../News/news_31-12-2025/
|
|
249
|
+
├── Hacker-News_31-12-2025/
|
|
250
|
+
│ ├── 01_Article_Title/
|
|
251
|
+
│ │ ├── article.md # Primary markdown file
|
|
252
|
+
│ │ ├── html/
|
|
253
|
+
│ │ │ └── article.html # Self-contained HTML with embedded CSS/JS
|
|
254
|
+
│ │ ├── images/
|
|
255
|
+
│ │ │ ├── content1.jpg
|
|
256
|
+
│ │ │ └── content2.png
|
|
257
|
+
│ │ └── comments.md # Discussions (HN, Lobsters sources)
|
|
258
|
+
│ └── 02_Another_Article/
|
|
259
|
+
└── BBC_31-12-2025/
|
|
260
|
+
└── ...
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### Single Article Mode
|
|
264
|
+
```
|
|
265
|
+
../Capcats/cc_31-12-2025-Article-Title/
|
|
266
|
+
├── article.md # Standalone markdown
|
|
267
|
+
├── html/
|
|
268
|
+
│ └── article.html # Complete standalone file
|
|
269
|
+
└── images/
|
|
270
|
+
└── ...
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## Privacy & Ethics
|
|
274
|
+
|
|
275
|
+
**Privacy-First Design**:
|
|
276
|
+
- Usernames anonymized as "Anonymous" in comments
|
|
277
|
+
- Profile links preserved for reference
|
|
278
|
+
- No personal data collection or storage
|
|
279
|
+
- Only public content archived
|
|
280
|
+
|
|
281
|
+
**Ethical Scraping**:
|
|
282
|
+
- Respects robots.txt
|
|
283
|
+
- Rate limiting (1 request per 10 seconds)
|
|
284
|
+
- Prefers RSS/APIs over HTML scraping
|
|
285
|
+
- No paywall circumvention
|
|
286
|
+
- Proper source attribution
|
|
287
|
+
|
|
288
|
+
## Advanced Usage
|
|
289
|
+
|
|
290
|
+
### Add Custom Sources
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# Interactive source addition
|
|
294
|
+
./capcat add-source --url https://example.com/rss
|
|
295
|
+
|
|
296
|
+
# Or edit configuration
|
|
297
|
+
nano sources/active/config_driven/configs/newsource.yaml
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### Configuration Priority
|
|
301
|
+
|
|
302
|
+
1. CLI arguments → 2. Environment variables → 3. `capcat.yml` → 4. Defaults
|
|
303
|
+
|
|
304
|
+
Example `capcat.yml`:
|
|
305
|
+
```yaml
|
|
306
|
+
output_base_dir: "../MyNews"
|
|
307
|
+
max_workers: 8
|
|
308
|
+
download_media: true
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Automation
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
# Daily tech news cron job
|
|
315
|
+
0 9 * * * cd /path/to/capcat && ./capcat bundle tech --count 20
|
|
316
|
+
|
|
317
|
+
# Weekly science digest
|
|
318
|
+
0 10 * * 0 cd /path/to/capcat && ./capcat bundle science --count 30 --media
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## Available Sources
|
|
322
|
+
|
|
323
|
+
**Tech**: Hacker News, Lobsters, InfoQ, IEEE Spectrum, Mashable
|
|
324
|
+
|
|
325
|
+
**AI**: Google Research, MIT News
|
|
326
|
+
|
|
327
|
+
**News**: BBC, The Guardian
|
|
328
|
+
|
|
329
|
+
**Science**: Nature, Scientific American
|
|
330
|
+
|
|
331
|
+
**Sports**: BBC Sport
|
|
332
|
+
|
|
333
|
+
**See all**: `./capcat list sources`
|
|
334
|
+
|
|
335
|
+
## Documentation
|
|
336
|
+
|
|
337
|
+
Full documentation at [capcat.org](https://capcat.org):
|
|
338
|
+
- [Quick Start Guide](https://capcat.org/docs/quick-start.html)
|
|
339
|
+
- [Architecture Overview](https://capcat.org/docs/architecture.html)
|
|
340
|
+
- [Source Development](https://capcat.org/docs/source-development.html)
|
|
341
|
+
- [Interactive Mode](https://capcat.org/docs/interactive-mode.html)
|
|
342
|
+
|
|
343
|
+
## Requirements
|
|
344
|
+
|
|
345
|
+
- Python 3.8+
|
|
346
|
+
- Internet connection
|
|
347
|
+
- ~50MB disk space for application
|
|
348
|
+
- Additional space for archived content
|
|
349
|
+
|
|
350
|
+
## Troubleshooting
|
|
351
|
+
|
|
352
|
+
**Dependency issues?**
|
|
353
|
+
```bash
|
|
354
|
+
./scripts/fix_dependencies.sh --force
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
**Module not found?**
|
|
358
|
+
```bash
|
|
359
|
+
./capcat list sources # Wrapper handles venv activation
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
**Source failing?**
|
|
363
|
+
- Check `test-diagnose-*.md` reports
|
|
364
|
+
- Most sources use RSS/APIs for reliable, ethical access
|
|
365
|
+
- Run `./capcat catch` and try individual sources
|
|
366
|
+
|
|
367
|
+
## Contributing
|
|
368
|
+
|
|
369
|
+
Contributions welcome! Open an issue or pull request on [GitHub](https://github.com/stayukasabov/capcat).
|
|
370
|
+
|
|
371
|
+
## License
|
|
372
|
+
|
|
373
|
+
MIT-Style Non-Commercial License - See [LICENSE.txt](LICENSE.txt)
|
|
374
|
+
|
|
375
|
+
## Links
|
|
376
|
+
|
|
377
|
+
- **Website**: [capcat.org](https://capcat.org)
|
|
378
|
+
- **Repository**: [github.com/stayukasabov/capcat](https://github.com/stayukasabov/capcat)
|
|
379
|
+
- **Issues**: [github.com/stayukasabov/capcat/issues](https://github.com/stayukasabov/capcat/issues)
|
|
380
|
+
- **Case Study**: [stayux.substack.com](https://stayux.substack.com)
|
|
381
|
+
|
|
382
|
+
---
|
|
383
|
+
|
|
384
|
+
**Archive with confidence. Share without limits.**
|