capcat 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. capcat-1.0.0/LICENSE.txt +42 -0
  2. capcat-1.0.0/PKG-INFO +384 -0
  3. capcat-1.0.0/README.md +303 -0
  4. capcat-1.0.0/capcat/__init__.py +4 -0
  5. capcat-1.0.0/capcat/__main__.py +4 -0
  6. capcat-1.0.0/capcat/cli.py +440 -0
  7. capcat-1.0.0/capcat/commands/__init__.py +0 -0
  8. capcat-1.0.0/capcat/commands/add_source.py +23 -0
  9. capcat-1.0.0/capcat/commands/fetch.py +73 -0
  10. capcat-1.0.0/capcat/commands/generate_config.py +32 -0
  11. capcat-1.0.0/capcat/commands/init.py +74 -0
  12. capcat-1.0.0/capcat/commands/remove_source.py +60 -0
  13. capcat-1.0.0/capcat/commands/single.py +219 -0
  14. capcat-1.0.0/capcat/core/__init__.py +0 -0
  15. capcat-1.0.0/capcat/core/article_fetcher.py +2720 -0
  16. capcat-1.0.0/capcat/core/circuit_breaker.py +479 -0
  17. capcat-1.0.0/capcat/core/cli_recovery.py +184 -0
  18. capcat-1.0.0/capcat/core/cli_validation.py +161 -0
  19. capcat-1.0.0/capcat/core/command_logging.py +157 -0
  20. capcat-1.0.0/capcat/core/config/__init__.py +216 -0
  21. capcat-1.0.0/capcat/core/config/source_base.py +321 -0
  22. capcat-1.0.0/capcat/core/config/source_registry.py +443 -0
  23. capcat-1.0.0/capcat/core/config.py +476 -0
  24. capcat-1.0.0/capcat/core/constants.py +58 -0
  25. capcat-1.0.0/capcat/core/conversion_executor.py +70 -0
  26. capcat-1.0.0/capcat/core/design_system_compiler.py +465 -0
  27. capcat-1.0.0/capcat/core/downloader.py +427 -0
  28. capcat-1.0.0/capcat/core/enhanced_argparse.py +113 -0
  29. capcat-1.0.0/capcat/core/error_handling.py +485 -0
  30. capcat-1.0.0/capcat/core/ethical_scraping.py +369 -0
  31. capcat-1.0.0/capcat/core/exceptions.py +185 -0
  32. capcat-1.0.0/capcat/core/formatter.py +1172 -0
  33. capcat-1.0.0/capcat/core/html_generator.py +1716 -0
  34. capcat-1.0.0/capcat/core/html_post_processor.py +661 -0
  35. capcat-1.0.0/capcat/core/image_processor.py +541 -0
  36. capcat-1.0.0/capcat/core/interactive.py +676 -0
  37. capcat-1.0.0/capcat/core/logging_config.py +203 -0
  38. capcat-1.0.0/capcat/core/media_config.py +289 -0
  39. capcat-1.0.0/capcat/core/media_executor.py +69 -0
  40. capcat-1.0.0/capcat/core/media_processor.py +906 -0
  41. capcat-1.0.0/capcat/core/network_resilience.py +569 -0
  42. capcat-1.0.0/capcat/core/news_source_adapter.py +1117 -0
  43. capcat-1.0.0/capcat/core/progress.py +1096 -0
  44. capcat-1.0.0/capcat/core/rate_limiter.py +281 -0
  45. capcat-1.0.0/capcat/core/retry.py +212 -0
  46. capcat-1.0.0/capcat/core/retry_skip.py +181 -0
  47. capcat-1.0.0/capcat/core/session_pool.py +186 -0
  48. capcat-1.0.0/capcat/core/shutdown.py +194 -0
  49. capcat-1.0.0/capcat/core/source_config.py +91 -0
  50. capcat-1.0.0/capcat/core/source_configs.py +133 -0
  51. capcat-1.0.0/capcat/core/source_factory.py +53 -0
  52. capcat-1.0.0/capcat/core/source_system/__init__.py +0 -0
  53. capcat-1.0.0/capcat/core/source_system/add_source_command.py +313 -0
  54. capcat-1.0.0/capcat/core/source_system/add_source_service.py +85 -0
  55. capcat-1.0.0/capcat/core/source_system/base_source.py +458 -0
  56. capcat-1.0.0/capcat/core/source_system/bundle_manager.py +446 -0
  57. capcat-1.0.0/capcat/core/source_system/bundle_models.py +61 -0
  58. capcat-1.0.0/capcat/core/source_system/bundle_service.py +472 -0
  59. capcat-1.0.0/capcat/core/source_system/bundle_ui.py +398 -0
  60. capcat-1.0.0/capcat/core/source_system/bundle_validator.py +220 -0
  61. capcat-1.0.0/capcat/core/source_system/config_driven_source.py +225 -0
  62. capcat-1.0.0/capcat/core/source_system/discovery_strategies.py +449 -0
  63. capcat-1.0.0/capcat/core/source_system/enhanced_remove_command.py +662 -0
  64. capcat-1.0.0/capcat/core/source_system/feed_discovery.py +186 -0
  65. capcat-1.0.0/capcat/core/source_system/feed_parser.py +293 -0
  66. capcat-1.0.0/capcat/core/source_system/performance_monitor.py +438 -0
  67. capcat-1.0.0/capcat/core/source_system/questionary_ui.py +221 -0
  68. capcat-1.0.0/capcat/core/source_system/removal_ui.py +166 -0
  69. capcat-1.0.0/capcat/core/source_system/remove_source_service.py +79 -0
  70. capcat-1.0.0/capcat/core/source_system/rss_feed_introspector.py +73 -0
  71. capcat-1.0.0/capcat/core/source_system/source_analytics.py +319 -0
  72. capcat-1.0.0/capcat/core/source_system/source_backup_manager.py +293 -0
  73. capcat-1.0.0/capcat/core/source_system/source_config.py +82 -0
  74. capcat-1.0.0/capcat/core/source_system/source_config_generator.py +66 -0
  75. capcat-1.0.0/capcat/core/source_system/source_factory.py +388 -0
  76. capcat-1.0.0/capcat/core/source_system/source_registry.py +488 -0
  77. capcat-1.0.0/capcat/core/source_system/validation_engine.py +888 -0
  78. capcat-1.0.0/capcat/core/specialized_source_manager.py +243 -0
  79. capcat-1.0.0/capcat/core/storage_manager.py +108 -0
  80. capcat-1.0.0/capcat/core/streamlined_comment_processor.py +361 -0
  81. capcat-1.0.0/capcat/core/template_renderer.py +158 -0
  82. capcat-1.0.0/capcat/core/theme_utils.py +78 -0
  83. capcat-1.0.0/capcat/core/timeout_config.py +332 -0
  84. capcat-1.0.0/capcat/core/timeout_wrapper.py +87 -0
  85. capcat-1.0.0/capcat/core/unified_article_processor.py +347 -0
  86. capcat-1.0.0/capcat/core/unified_media_processor.py +133 -0
  87. capcat-1.0.0/capcat/core/unified_source_processor.py +807 -0
  88. capcat-1.0.0/capcat/core/update_manager.py +420 -0
  89. capcat-1.0.0/capcat/core/url_utils.py +241 -0
  90. capcat-1.0.0/capcat/core/utils.py +277 -0
  91. capcat-1.0.0/capcat/htmlgen/__init__.py +88 -0
  92. capcat-1.0.0/capcat/htmlgen/base/base_generator.py +574 -0
  93. capcat-1.0.0/capcat/htmlgen/hn/generator.py +390 -0
  94. capcat-1.0.0/capcat/htmlgen/lb/generator.py +446 -0
  95. capcat-1.0.0/capcat/htmlgen/lesswrong/generator.py +426 -0
  96. capcat-1.0.0/capcat/sources/__init__.py +0 -0
  97. capcat-1.0.0/capcat/sources/base/__init__.py +7 -0
  98. capcat-1.0.0/capcat/sources/base/config_schema.py +168 -0
  99. capcat-1.0.0/capcat/sources/base/factory.py +103 -0
  100. capcat-1.0.0/capcat/sources/builtin/__init__.py +0 -0
  101. capcat-1.0.0/capcat/sources/builtin/bundles.yml +48 -0
  102. capcat-1.0.0/capcat/sources/builtin/business_sources.yml +3 -0
  103. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/bbc.yaml +63 -0
  104. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/bbcsport.yaml +70 -0
  105. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/google-reserch.yml +12 -0
  106. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/guardian.yaml +95 -0
  107. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/ieee.yaml +73 -0
  108. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/iq.yaml +67 -0
  109. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/mashable.yml +12 -0
  110. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/mitnews.yaml +89 -0
  111. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/nature.yaml +69 -0
  112. capcat-1.0.0/capcat/sources/builtin/config_driven/configs/scientificamerican.yaml +67 -0
  113. capcat-1.0.0/capcat/sources/builtin/custom/hn/config.yaml +21 -0
  114. capcat-1.0.0/capcat/sources/builtin/custom/hn/source.py +387 -0
  115. capcat-1.0.0/capcat/sources/builtin/custom/lb/config.yaml +26 -0
  116. capcat-1.0.0/capcat/sources/builtin/custom/lb/source.py +702 -0
  117. capcat-1.0.0/capcat/sources/builtin/news_sources.yml +13 -0
  118. capcat-1.0.0/capcat/sources/builtin/science_sources.yml +20 -0
  119. capcat-1.0.0/capcat/sources/builtin/tech_sources.yml +55 -0
  120. capcat-1.0.0/capcat/sources/specialized/__init__.py +50 -0
  121. capcat-1.0.0/capcat/sources/specialized/medium/source.py +623 -0
  122. capcat-1.0.0/capcat/sources/specialized/substack/source.py +711 -0
  123. capcat-1.0.0/capcat/sources/specialized/twitter/__init__.py +4 -0
  124. capcat-1.0.0/capcat/sources/specialized/twitter/source.py +84 -0
  125. capcat-1.0.0/capcat/sources/specialized/vimeo/__init__.py +4 -0
  126. capcat-1.0.0/capcat/sources/specialized/vimeo/source.py +164 -0
  127. capcat-1.0.0/capcat/sources/specialized/youtube/__init__.py +4 -0
  128. capcat-1.0.0/capcat/sources/specialized/youtube/source.py +132 -0
  129. capcat-1.0.0/capcat/themes/DS-logic.md +75 -0
  130. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Bold.woff +0 -0
  131. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Bold.woff2 +0 -0
  132. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Light.woff +0 -0
  133. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Light.woff2 +0 -0
  134. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Medium.woff +0 -0
  135. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Medium.woff2 +0 -0
  136. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Regular.woff +0 -0
  137. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-Regular.woff2 +0 -0
  138. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-SemiBold.woff +0 -0
  139. capcat-1.0.0/capcat/themes/Space-Grotesk/SpaceGrotesk-SemiBold.woff2 +0 -0
  140. capcat-1.0.0/capcat/themes/base.css +1529 -0
  141. capcat-1.0.0/capcat/themes/design-system.css +518 -0
  142. capcat-1.0.0/capcat/themes/js/capcat.js +273 -0
  143. capcat-1.0.0/capcat/themes/test new orange +2 -0
  144. capcat-1.0.0/capcat/tui.py +8 -0
  145. capcat-1.0.0/capcat.egg-info/PKG-INFO +384 -0
  146. capcat-1.0.0/capcat.egg-info/SOURCES.txt +191 -0
  147. capcat-1.0.0/capcat.egg-info/dependency_links.txt +1 -0
  148. capcat-1.0.0/capcat.egg-info/entry_points.txt +2 -0
  149. capcat-1.0.0/capcat.egg-info/requires.txt +25 -0
  150. capcat-1.0.0/capcat.egg-info/top_level.txt +1 -0
  151. capcat-1.0.0/pyproject.toml +180 -0
  152. capcat-1.0.0/setup.cfg +4 -0
  153. capcat-1.0.0/tests/test_add_source_command_refactored.py +545 -0
  154. capcat-1.0.0/tests/test_add_source_service.py +188 -0
  155. capcat-1.0.0/tests/test_batch_specialized_integration.py +398 -0
  156. capcat-1.0.0/tests/test_bundle_manager.py +67 -0
  157. capcat-1.0.0/tests/test_bundle_manager_remove.py +150 -0
  158. capcat-1.0.0/tests/test_bundle_validator.py +566 -0
  159. capcat-1.0.0/tests/test_cleanup.py +249 -0
  160. capcat-1.0.0/tests/test_cli_add_source.py +88 -0
  161. capcat-1.0.0/tests/test_discovery_strategies.py +292 -0
  162. capcat-1.0.0/tests/test_error_handling.py +476 -0
  163. capcat-1.0.0/tests/test_ethical_scraping.py +61 -0
  164. capcat-1.0.0/tests/test_feed_date_sorting.py +110 -0
  165. capcat-1.0.0/tests/test_feed_parser.py +232 -0
  166. capcat-1.0.0/tests/test_fetch_arguments.py +337 -0
  167. capcat-1.0.0/tests/test_fetch_command.py +43 -0
  168. capcat-1.0.0/tests/test_formatter_none_handling.py +83 -0
  169. capcat-1.0.0/tests/test_help_examples.py +394 -0
  170. capcat-1.0.0/tests/test_hn_pagination.py +369 -0
  171. capcat-1.0.0/tests/test_index_filename_detection.py +138 -0
  172. capcat-1.0.0/tests/test_interactive.py +67 -0
  173. capcat-1.0.0/tests/test_list_command.py +335 -0
  174. capcat-1.0.0/tests/test_lobsters_pagination.py +395 -0
  175. capcat-1.0.0/tests/test_media_processor_step1.py +297 -0
  176. capcat-1.0.0/tests/test_media_processor_step2.py +228 -0
  177. capcat-1.0.0/tests/test_pdf_handling.py +201 -0
  178. capcat-1.0.0/tests/test_pdf_skip_prompt.py +466 -0
  179. capcat-1.0.0/tests/test_pep8_refactoring.py +337 -0
  180. capcat-1.0.0/tests/test_remove_source_command.py +410 -0
  181. capcat-1.0.0/tests/test_retry_skip_logic.py +464 -0
  182. capcat-1.0.0/tests/test_rss_feed_introspector.py +96 -0
  183. capcat-1.0.0/tests/test_single_command.py +47 -0
  184. capcat-1.0.0/tests/test_source_analytics.py +273 -0
  185. capcat-1.0.0/tests/test_source_backup_manager.py +202 -0
  186. capcat-1.0.0/tests/test_source_config_generator.py +62 -0
  187. capcat-1.0.0/tests/test_source_management_menu.py +503 -0
  188. capcat-1.0.0/tests/test_theme_hash_persistence.py +118 -0
  189. capcat-1.0.0/tests/test_thread_safe_timeout.py +271 -0
  190. capcat-1.0.0/tests/test_title_truncation.py +115 -0
  191. capcat-1.0.0/tests/test_unified_article_processor.py +269 -0
  192. capcat-1.0.0/tests/test_url_replacement.py +379 -0
  193. capcat-1.0.0/tests/test_video_sources.py +138 -0
@@ -0,0 +1,42 @@
1
+ MIT-Style Non-Commercial License
2
+
3
+ Copyright (c) 2025 Stayu Kasabov
4
+
5
+ Original Product: Capcat - News Article Archiving System
6
+ Author: Stayu Kasabov | https://stayux.com
7
+ Product Designer with Holistic Production Expertise
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction for NON-COMMERCIAL PURPOSES ONLY,
12
+ including without limitation the rights to use, copy, modify, merge, publish,
13
+ distribute, sublicense, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ 1. NON-COMMERCIAL USE ONLY: This software may not be used for commercial
17
+ purposes. Commercial purposes include, but are not limited to: selling
18
+ the software, using it in a commercial product or service, or using it
19
+ to generate revenue.
20
+
21
+ 2. ATTRIBUTION: The above copyright notice and this permission notice shall
22
+ be included in all copies or substantial portions of the Software. Credit
23
+ must be given to the original author: Stayu Kasabov (https://stayux.com)
24
+
25
+ 3. SHARE ALIKE: Any modifications or derivative works must be released under
26
+ the same non-commercial terms.
27
+
28
+ 4. CONTRIBUTIONS WELCOME: Users are encouraged to contribute improvements
29
+ back to the original project.
30
+
31
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37
+ SOFTWARE.
38
+
39
+ ---
40
+
41
+ A free and open-source tool to make people's lives easier.
42
+ Contributions welcome! Contact: Stayu Kasabov | https://stayux.com
capcat-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,384 @@
1
+ Metadata-Version: 2.4
2
+ Name: capcat
3
+ Version: 1.0.0
4
+ Summary: A command-line tool designed to solve content preservation challenges with Ethical Scraping.
5
+ Author: Stayu Kasabov - Product Designer and Experiences Builder | AI-powered Prototyping & MVP | Strategic Generalist
6
+ License: MIT-Style Non-Commercial License
7
+
8
+ Copyright (c) 2025 Stayu Kasabov
9
+
10
+ Original Product: Capcat - News Article Archiving System
11
+ Author: Stayu Kasabov | https://stayux.com
12
+ Product Designer with Holistic Production Expertise
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction for NON-COMMERCIAL PURPOSES ONLY,
17
+ including without limitation the rights to use, copy, modify, merge, publish,
18
+ distribute, sublicense, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ 1. NON-COMMERCIAL USE ONLY: This software may not be used for commercial
22
+ purposes. Commercial purposes include, but are not limited to: selling
23
+ the software, using it in a commercial product or service, or using it
24
+ to generate revenue.
25
+
26
+ 2. ATTRIBUTION: The above copyright notice and this permission notice shall
27
+ be included in all copies or substantial portions of the Software. Credit
28
+ must be given to the original author: Stayu Kasabov (https://stayux.com)
29
+
30
+ 3. SHARE ALIKE: Any modifications or derivative works must be released under
31
+ the same non-commercial terms.
32
+
33
+ 4. CONTRIBUTIONS WELCOME: Users are encouraged to contribute improvements
34
+ back to the original project.
35
+
36
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42
+ SOFTWARE.
43
+
44
+ ---
45
+
46
+ A free and open-source tool to make people's lives easier.
47
+ Contributions welcome! Contact: Stayu Kasabov | https://stayux.com
48
+ Project-URL: Homepage, https://github.com/<owner>/capcat
49
+ Classifier: Programming Language :: Python :: 3
50
+ Classifier: License :: Free for non-commercial use
51
+ Classifier: Operating System :: OS Independent
52
+ Classifier: Environment :: Console
53
+ Requires-Python: >=3.8
54
+ Description-Content-Type: text/markdown
55
+ License-File: LICENSE.txt
56
+ Requires-Dist: requests<3.0.0,>=2.28.0
57
+ Requires-Dist: beautifulsoup4<5.0.0,>=4.12.0
58
+ Requires-Dist: PyYAML<7.0,>=6.0
59
+ Requires-Dist: feedparser<7.0,>=6.0
60
+ Requires-Dist: questionary<3.0,>=2.0
61
+ Requires-Dist: markdownify<1.0,>=0.11
62
+ Requires-Dist: lxml<6.0,>=4.9
63
+ Requires-Dist: ruamel.yaml<0.19,>=0.17
64
+ Requires-Dist: validators<1.0,>=0.20
65
+ Requires-Dist: prompt_toolkit<4.0,>=3.0
66
+ Requires-Dist: yt-dlp<2027.0.0,>=2023.1.6
67
+ Requires-Dist: markdown<4.0,>=3.5
68
+ Requires-Dist: pygments<3.0,>=2.16
69
+ Requires-Dist: charset-normalizer<4.0,>=3.0
70
+ Requires-Dist: brotli<2.0,>=1.0
71
+ Requires-Dist: pynput<2.0,>=1.7.6
72
+ Requires-Dist: rich<15.0,>=13.0
73
+ Provides-Extra: dev
74
+ Requires-Dist: pytest>=7.0; extra == "dev"
75
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
76
+ Requires-Dist: mypy>=1.0; extra == "dev"
77
+ Requires-Dist: ruff>=0.1; extra == "dev"
78
+ Requires-Dist: build>=1.0; extra == "dev"
79
+ Requires-Dist: twine>=5.0; extra == "dev"
80
+ Dynamic: license-file
81
+
82
+ # Capcat - Archive and Share Articles with Confidence
83
+
84
+ A dual-mode news archiving tool that captures articles from 12 curated sources as **clean Markdown files** (Obsidian-ready) with optional **self-contained HTML** output - perfect for knowledge management and offline sharing.
85
+
86
+ ## Why Capcat?
87
+
88
+ **Build Your Knowledge Base**: Every article saved as clean Markdown - drop directly into Obsidian for full-text search, backlinks, and graph views. Perfect for researchers and lifelong learners.
89
+
90
+ **Share Without Breaking**: Optional self-contained HTML output with all styles and scripts embedded. Send to anyone, open anywhere, years later - it just works.
91
+
92
+ **Two Ways to Use**:
93
+ - **Interactive Menu** (`./capcat catch`) - Visual interface for browsing sources and bundles
94
+ - **Command Line** - Fast automation for power users
95
+
96
+ **Curated Bundles**: Pre-configured collections like Tech, AI, Science, News - fetch multiple related sources at once.
97
+
98
+ ## Quick Start
99
+
100
+ ### Interactive Mode (Recommended)
101
+
102
+ ```bash
103
+ ./capcat catch
104
+ ```
105
+
106
+ Choose from:
107
+ - **Fetch by Source** - Browse 12 curated sources (Hacker News, BBC, IEEE, Nature, etc.)
108
+ - **Fetch by Bundle** - Curated collections (Tech, AI, Science, News, Sports)
109
+ - **Single Article** - Archive any URL instantly
110
+ - **Source Management** - Add custom RSS/news sources
111
+
112
+ ### Command Line Mode
113
+
114
+ ```bash
115
+ # Fetch curated tech bundle (IEEE + Mashable)
116
+ ./capcat bundle tech --count 10
117
+
118
+ # Fetch specific sources with media
119
+ ./capcat fetch hn,bbc --count 15 --media
120
+
121
+ # Archive a single article
122
+ ./capcat single https://example.com/article
123
+
124
+ # List all available sources
125
+ ./capcat list sources
126
+ ```
127
+
128
+ ## Key Features
129
+
130
+ ### Self-Contained HTML for Easy Sharing
131
+
132
+ Every article is a **complete, portable HTML file**:
133
+ - **Embedded CSS** - All styles inline, no external stylesheets
134
+ - **Embedded JavaScript** - Interactive features work offline
135
+ - **Local Images** - Downloaded and stored with the article
136
+ - **No Dependencies** - Open in any browser, share via email, archive forever
137
+
138
+ **Perfect for**:
139
+ - Email attachments that always look right
140
+ - Long-term archiving without link rot
141
+ - Offline reading on any device
142
+ - Sharing articles that might disappear
143
+
144
+ ### Dual Interface
145
+
146
+ **Interactive Menu** (`./capcat catch`):
147
+ - Visual source selection
148
+ - Bundle browsing
149
+ - Progress tracking
150
+ - Error handling with retries
151
+ - No commands to memorize
152
+
153
+ **Command Line**:
154
+ - Fast automation and scripting
155
+ - Batch processing
156
+ - CI/CD integration
157
+ - Power user workflows
158
+
159
+ ### Smart Content Extraction
160
+
161
+ - **12 Curated Sources** - HN, BBC, Guardian, Nature, IEEE, Scientific American, MIT News, and more
162
+ - **Intelligent Fallback** - Finds images even when primary extraction misses them
163
+ - **Comment Preservation** - Captures discussions with privacy anonymization
164
+ - **Media Handling** - Images always downloaded, video/audio/PDFs with `--media` flag
165
+
166
+ ### Markdown-Native Output
167
+
168
+ - **Obsidian-Ready** - Clean markdown files you can drop directly into your vault
169
+ - **Portable Archives** - Standard markdown format works everywhere
170
+ - **Local Images** - All media downloaded and referenced with relative paths
171
+ - **Metadata Headers** - Source, date, and URL preserved in frontmatter-style headers
172
+
173
+ ### Bundle System
174
+
175
+ Pre-configured topic collections:
176
+
177
+ | Bundle | Sources | Description |
178
+ |--------|---------|-------------|
179
+ | `tech` | IEEE, Mashable | Consumer technology news |
180
+ | `techpro` | HN, Lobsters, InfoQ | Professional developer news |
181
+ | `ai` | MIT News, Google Research | AI research and developments |
182
+ | `science` | Nature, Scientific American | Scientific publications |
183
+ | `news` | BBC, Guardian | General news |
184
+ | `sports` | BBC Sport | Sports coverage |
185
+
186
+ Add your own bundles in `sources/active/bundles.yml`.
187
+
188
+ ## Installation
189
+
190
+ ### Quick Setup
191
+
192
+ ```bash
193
+ # Clone the repository
194
+ git clone https://github.com/stayukasabov/capcat.git
195
+ cd capcat/Application
196
+
197
+ # Auto-fix dependencies (recommended)
198
+ ./scripts/fix_dependencies.sh
199
+
200
+ # Or manual setup
201
+ python3 -m venv venv
202
+ source venv/bin/activate
203
+ pip install -r requirements.txt
204
+ ```
205
+
206
+ ### First Run
207
+
208
+ ```bash
209
+ # Launch interactive menu
210
+ ./capcat catch
211
+
212
+ # Or try a quick fetch
213
+ ./capcat fetch hn --count 5
214
+ ```
215
+
216
+ ## Markdown-First Workflow (Obsidian Compatible)
217
+
218
+ Every article is saved as **clean Markdown** with proper formatting:
219
+
220
+ ```markdown
221
+ # Article Title
222
+
223
+ **Source**: Hacker News | **Date**: 2025-12-31 | **URL**: [Original Link]
224
+
225
+ ## Content
226
+
227
+ Article body with images referenced locally...
228
+
229
+ ![Image Description](../images/image.jpg)
230
+ ```
231
+
232
+ **Perfect for Knowledge Management**:
233
+ - **Obsidian**: Drag folders directly into your vault for full-text search and backlinks
234
+ - **Notion**: Import markdown files while preserving structure
235
+ - **Logseq/Roam**: Compatible with daily notes and graph views
236
+ - **Standard Editors**: Works in VS Code, Typora, iA Writer, or any markdown editor
237
+
238
+ **Metadata Included**:
239
+ - Source attribution
240
+ - Publication date
241
+ - Original URLs
242
+ - Local image paths (relative linking)
243
+
244
+ ## Output Structure
245
+
246
+ ### Batch Mode (fetch/bundle)
247
+ ```
248
+ ../News/news_31-12-2025/
249
+ ├── Hacker-News_31-12-2025/
250
+ │ ├── 01_Article_Title/
251
+ │ │ ├── article.md # Primary markdown file
252
+ │ │ ├── html/
253
+ │ │ │ └── article.html # Self-contained HTML with embedded CSS/JS
254
+ │ │ ├── images/
255
+ │ │ │ ├── content1.jpg
256
+ │ │ │ └── content2.png
257
+ │ │ └── comments.md # Discussions (HN, Lobsters sources)
258
+ │ └── 02_Another_Article/
259
+ └── BBC_31-12-2025/
260
+ └── ...
261
+ ```
262
+
263
+ ### Single Article Mode
264
+ ```
265
+ ../Capcats/cc_31-12-2025-Article-Title/
266
+ ├── article.md # Standalone markdown
267
+ ├── html/
268
+ │ └── article.html # Complete standalone file
269
+ └── images/
270
+ └── ...
271
+ ```
272
+
273
+ ## Privacy & Ethics
274
+
275
+ **Privacy-First Design**:
276
+ - Usernames anonymized as "Anonymous" in comments
277
+ - Profile links preserved for reference
278
+ - No personal data collection or storage
279
+ - Only public content archived
280
+
281
+ **Ethical Scraping**:
282
+ - Respects robots.txt
283
+ - Rate limiting (1 request per 10 seconds)
284
+ - Prefers RSS/APIs over HTML scraping
285
+ - No paywall circumvention
286
+ - Proper source attribution
287
+
288
+ ## Advanced Usage
289
+
290
+ ### Add Custom Sources
291
+
292
+ ```bash
293
+ # Interactive source addition
294
+ ./capcat add-source --url https://example.com/rss
295
+
296
+ # Or edit configuration
297
+ nano sources/active/config_driven/configs/newsource.yaml
298
+ ```
299
+
300
+ ### Configuration Priority
301
+
302
+ 1. CLI arguments → 2. Environment variables → 3. `capcat.yml` → 4. Defaults
303
+
304
+ Example `capcat.yml`:
305
+ ```yaml
306
+ output_base_dir: "../MyNews"
307
+ max_workers: 8
308
+ download_media: true
309
+ ```
310
+
311
+ ### Automation
312
+
313
+ ```bash
314
+ # Daily tech news cron job
315
+ 0 9 * * * cd /path/to/capcat && ./capcat bundle tech --count 20
316
+
317
+ # Weekly science digest
318
+ 0 10 * * 0 cd /path/to/capcat && ./capcat bundle science --count 30 --media
319
+ ```
320
+
321
+ ## Available Sources
322
+
323
+ **Tech**: Hacker News, Lobsters, InfoQ, IEEE Spectrum, Mashable
324
+
325
+ **AI**: Google Research, MIT News
326
+
327
+ **News**: BBC, The Guardian
328
+
329
+ **Science**: Nature, Scientific American
330
+
331
+ **Sports**: BBC Sport
332
+
333
+ **See all**: `./capcat list sources`
334
+
335
+ ## Documentation
336
+
337
+ Full documentation at [capcat.org](https://capcat.org):
338
+ - [Quick Start Guide](https://capcat.org/docs/quick-start.html)
339
+ - [Architecture Overview](https://capcat.org/docs/architecture.html)
340
+ - [Source Development](https://capcat.org/docs/source-development.html)
341
+ - [Interactive Mode](https://capcat.org/docs/interactive-mode.html)
342
+
343
+ ## Requirements
344
+
345
+ - Python 3.8+
346
+ - Internet connection
347
+ - ~50MB disk space for application
348
+ - Additional space for archived content
349
+
350
+ ## Troubleshooting
351
+
352
+ **Dependency issues?**
353
+ ```bash
354
+ ./scripts/fix_dependencies.sh --force
355
+ ```
356
+
357
+ **Module not found?**
358
+ ```bash
359
+ ./capcat list sources # Wrapper handles venv activation
360
+ ```
361
+
362
+ **Source failing?**
363
+ - Check `test-diagnose-*.md` reports
364
+ - Most sources use RSS/APIs for reliable, ethical access
365
+ - Run `./capcat catch` and try individual sources
366
+
367
+ ## Contributing
368
+
369
+ Contributions welcome! Open an issue or pull request on [GitHub](https://github.com/stayukasabov/capcat).
370
+
371
+ ## License
372
+
373
+ MIT-Style Non-Commercial License - See [LICENSE.txt](LICENSE.txt)
374
+
375
+ ## Links
376
+
377
+ - **Website**: [capcat.org](https://capcat.org)
378
+ - **Repository**: [github.com/stayukasabov/capcat](https://github.com/stayukasabov/capcat)
379
+ - **Issues**: [github.com/stayukasabov/capcat/issues](https://github.com/stayukasabov/capcat/issues)
380
+ - **Case Study**: [stayux.substack.com](https://stayux.substack.com)
381
+
382
+ ---
383
+
384
+ **Archive with confidence. Share without limits.**