PyPI - notegen - Versions diffs - 1.0.2__tar.gz → 2.0.0__tar.gz - Mend

notegen 1.0.2 (tar.gz) → 2.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{notegen-1.0.2 → notegen-2.0.0}/PKG-INFO +116 -11
notegen-2.0.0/README.md +261 -0
notegen-2.0.0/notes_gen/cache.py +63 -0
notegen-2.0.0/notes_gen/cli.py +525 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/config.py +34 -2
notegen-2.0.0/notes_gen/output/formats.py +73 -0
notegen-2.0.0/notes_gen/processing/dry_run.py +101 -0
notegen-2.0.0/notes_gen/processing/llm.py +229 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/processing/merger.py +25 -2
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/sources/text.py +35 -4
notegen-2.0.0/notes_gen/sources/watch.py +83 -0
notegen-2.0.0/notes_gen/sources/web.py +246 -0
notegen-2.0.0/notes_gen/sources/youtube.py +276 -0
{notegen-1.0.2 → notegen-2.0.0}/pyproject.toml +2 -1
notegen-2.0.0/tests/fixtures/sample_html.html +65 -0
notegen-2.0.0/tests/test_cache.py +85 -0
notegen-2.0.0/tests/test_cli.py +310 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_config.py +27 -0
notegen-2.0.0/tests/test_formats.py +65 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_llm.py +128 -9
notegen-2.0.0/tests/test_merger.py +115 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_text.py +87 -87
notegen-2.0.0/tests/test_watch.py +91 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_web.py +178 -153
{notegen-1.0.2 → notegen-2.0.0}/tests/test_youtube.py +251 -197
notegen-1.0.2/README.md +0 -157
notegen-1.0.2/notes_gen/cli.py +0 -199
notegen-1.0.2/notes_gen/processing/llm.py +0 -128
notegen-1.0.2/notes_gen/sources/web.py +0 -165
notegen-1.0.2/notes_gen/sources/youtube.py +0 -162
notegen-1.0.2/tests/fixtures/sample_html.html +0 -30
notegen-1.0.2/tests/test_cli.py +0 -139
notegen-1.0.2/tests/test_merger.py +0 -52
{notegen-1.0.2 → notegen-2.0.0}/.gitignore +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/__init__.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/output/__init__.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/output/formatter.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/output/writer.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/processing/__init__.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/processing/chunker.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/processing/filter.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/notes_gen/sources/__init__.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/__init__.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/fixtures/.gitkeep +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/fixtures/sample_transcript.txt +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_chunker.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_filter.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_formatter.py +0 -0
{notegen-1.0.2 → notegen-2.0.0}/tests/test_writer.py +0 -0

{notegen-1.0.2 → notegen-2.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: notegen
-Version: 1.0.2
+Version: 2.0.0
 Summary: Convert YouTube videos, playlists, and web pages into Obsidian markdown notes using LLMs
 Project-URL: Homepage, https://github.com/moneytosms/notegen
 Project-URL: Bug Tracker, https://github.com/moneytosms/notegen/issues
@@ -26,12 +26,15 @@ Requires-Dist: rich>=13
 Requires-Dist: tiktoken>=0.7
 Requires-Dist: trafilatura>=1.12
 Requires-Dist: typer>=0.12
+Requires-Dist: watchfiles>=1.2.0
 Requires-Dist: youtube-transcript-api>=0.6
 Requires-Dist: yt-dlp>=2024.1
 Description-Content-Type: text/markdown
 # notegen
+[![CI](https://github.com/moneytosms/notegen/actions/workflows/ci.yml/badge.svg)](https://github.com/moneytosms/notegen/actions/workflows/ci.yml)
 Convert YouTube videos, playlists, and web pages into structured Obsidian-flavored markdown notes using LLMs.
 ## Install
@@ -49,7 +52,10 @@ notegen config init
 # 2. Open config and add your API key
 notegen config open
-# 3. Generate notes
+# 3. Verify everything works
+notegen doctor
+# 4. Generate notes
 notegen https://youtube.com/watch?v=...
 ```
@@ -64,25 +70,46 @@ notegen transcript.txt
 # Explicit commands
 notegen video <youtube-url>
-notegen playlist <playlist-url> [--force]
+notegen playlist <playlist-url> [--force] [--force-restart]
 notegen web <url>
 notegen text <file-or-stdin>
 notegen text -                          # stdin
+# Watch a folder — auto-process new .txt/.md files
+notegen watch ./inbox
+# Dry run — estimate tokens/cost without calling LLM
+notegen -n https://youtube.com/watch?v=...
+notegen video <url> --dry-run
+# Output format
+notegen video <url> --format logseq
+notegen web <url> --format plain
 # Config
-notegen config init   # create config file
-notegen config open   # open config in your default editor
-notegen config show   # print resolved config
+notegen config init      # create config file
+notegen config open      # open config in your default editor
+notegen config show      # print resolved config
+notegen config validate  # check structure + API key presence
+notegen doctor           # config check + real test API call
+# Cache
+notegen cache clear      # remove cached files from ~/.cache/notegen/
 ```
 ## Options
 | Flag | Description |
 |---|---|
-| `-o / --output-dir` | Override output directory |
-| `-m / --model` | LiteLLM model string (e.g. `groq/llama-3.3-70b-versatile`) |
+| `-o / --output-dir PATH` | Override output directory |
+| `-m / --model TEXT` | LiteLLM model string (e.g. `groq/llama-3.3-70b-versatile`) |
+| `-v / --verbose` | Show chunk count, token usage, model/key selection, crawl status |
 | `--no-mermaid` | Disable mermaid diagram generation |
+| `--no-cache` | Skip cache read/write for this run |
+| `-n / --dry-run` | Print token/cost estimate; skip LLM call |
+| `--format TEXT` | Output format: `obsidian` (default) · `logseq` · `plain` · `roam` |
 | `--force` | Skip playlist videos without captions instead of aborting |
+| `--force-restart` | Ignore playlist resume file, reprocess all videos |
 ## Config file
@@ -106,6 +133,19 @@ model: anthropic/claude-sonnet-4-6
 output_dir: ~/notes
 mermaid: true
+# Output format: obsidian (default) | logseq | plain | roam
+output_format: obsidian
+# Caching — transcripts + LLM output cached in ~/.cache/notegen/
+# Set to false to always re-fetch and re-generate
+cache: true
+# Token budget — compress output if it exceeds this many tokens (0 = no limit)
+max_output_tokens: 0
+# Fuzzy dedup — skip near-duplicate sections in merged notes (Jaccard threshold)
+merger_similarity_threshold: 0.7
 # API key rotation — add multiple keys per provider.
 # notegen picks one at random each request (useful for free-tier rate limits).
 api_keys:
@@ -139,8 +179,6 @@ web_max_pages: 50
 web_max_depth: 3
 # Rate limiting & retry (important for free-tier providers like Groq, Gemini)
-# On a 429 error: cools down the offending key, rotates to another if available,
-# otherwise waits using Retry-After header or exponential backoff.
 max_retries: 5
 retry_base_delay: 60.0   # seconds; backoff = base * 2^attempt
 ```
@@ -164,6 +202,72 @@ retry_base_delay: 60.0   # seconds; backoff = base * 2^attempt
 Any provider supported by [LiteLLM](https://docs.litellm.ai/docs/providers) works.
+## Env var API keys
+As an alternative to the config file, set `NOTEGEN_<PROVIDER>_KEY` env vars. These are used as a fallback when no keys are configured for a provider:
+```bash
+export NOTEGEN_GROQ_KEY=gsk_...
+export NOTEGEN_ANTHROPIC_KEY=sk-ant-...
+export NOTEGEN_GEMINI_KEY=AIzaSy...
+```
+Config keys take priority over env vars. Env vars are useful for CI or server use.
+## Caching
+Transcripts and LLM-generated notes are cached in `~/.cache/notegen/` (transcripts keyed on URL; notes keyed on URL + model). Re-running the same source skips fetch and LLM calls entirely.
+```bash
+notegen video <url>          # first run: fetches + generates + caches
+notegen video <url>          # second run: serves from cache instantly
+notegen video <url> --no-cache   # bypass cache for this run
+notegen cache clear          # wipe all cached files
+```
+## Dry run
+Estimate tokens and cost before committing to a run:
+```bash
+notegen -n https://youtube.com/playlist?list=...
+```
+Prints a Rich table with chunk count, token count, estimated cost, and estimated generation time. No LLM calls are made, no files are written.
+## Output formats
+Use `--format` to target different note-taking apps:
+| Format | Syntax style |
+|---|---|
+| `obsidian` (default) | `[[wikilinks]]`, `> [!TIP]` callouts, mermaid diagrams |
+| `logseq` | Bullet-based, `#+BEGIN_TIP` blocks |
+| `plain` | Clean markdown, no app-specific syntax |
+| `roam` | `#[[hashtag refs]]` |
+## Watch mode
+Drop files into a folder and notegen auto-processes them:
+```bash
+notegen watch ./inbox --output-dir ./notes
+```
+- Processes existing unprocessed `.txt`/`.md` files on startup
+- Watches for new files; processes each as it appears
+- Tracks processed files in `.watch-state.json` (won't reprocess on restart)
+- Ctrl+C exits cleanly
+## Playlist resume
+Long playlists are resumable. Progress is saved to `.progress.json` in the output folder after each video. If a run is interrupted, re-running the same command skips already-completed videos.
+```bash
+notegen playlist <url>            # resumes from where it left off
+notegen playlist <url> --force-restart   # ignore progress, reprocess all
+```
 ## Rate limiting
 Free-tier providers (Groq, Gemini, Together AI, etc.) enforce strict TPM/RPM limits. notegen handles 429 errors automatically:
@@ -175,12 +279,13 @@ With the defaults (`max_retries: 5`, `retry_base_delay: 60`), the wait sequence
 ## Output format
-Obsidian-flavored markdown:
+Obsidian-flavored markdown (default):
 - YAML frontmatter (`title`, `source`, `type`, `tags`, `date`)
 - `##` / `###` headings only
 - `> [!TIP]` / `> [!WARNING]` callouts
 - Mermaid diagrams for flows and architectures
 - `[[wikilinks]]` for cross-references
+- Tags auto-inferred by LLM from content
 - Playlist → folder + `index.md` with wikilinks to each video note
 ## Requirements

notegen-2.0.0/README.md ADDED Viewed

@@ -0,0 +1,261 @@
+# notegen
+[![CI](https://github.com/moneytosms/notegen/actions/workflows/ci.yml/badge.svg)](https://github.com/moneytosms/notegen/actions/workflows/ci.yml)
+Convert YouTube videos, playlists, and web pages into structured Obsidian-flavored markdown notes using LLMs.
+## Install
+```bash
+pip install notegen
+```
+## Quick start
+```bash
+# 1. Create config
+notegen config init
+# 2. Open config and add your API key
+notegen config open
+# 3. Verify everything works
+notegen doctor
+# 4. Generate notes
+notegen https://youtube.com/watch?v=...
+```
+## Usage
+```bash
+# Auto-detect source type (bare URL or file)
+notegen https://youtube.com/watch?v=...
+notegen https://youtube.com/playlist?list=...
+notegen https://example.com/article
+notegen transcript.txt
+# Explicit commands
+notegen video <youtube-url>
+notegen playlist <playlist-url> [--force] [--force-restart]
+notegen web <url>
+notegen text <file-or-stdin>
+notegen text -                          # stdin
+# Watch a folder — auto-process new .txt/.md files
+notegen watch ./inbox
+# Dry run — estimate tokens/cost without calling LLM
+notegen -n https://youtube.com/watch?v=...
+notegen video <url> --dry-run
+# Output format
+notegen video <url> --format logseq
+notegen web <url> --format plain
+# Config
+notegen config init      # create config file
+notegen config open      # open config in your default editor
+notegen config show      # print resolved config
+notegen config validate  # check structure + API key presence
+notegen doctor           # config check + real test API call
+# Cache
+notegen cache clear      # remove cached files from ~/.cache/notegen/
+```
+## Options
+| Flag | Description |
+|---|---|
+| `-o / --output-dir PATH` | Override output directory |
+| `-m / --model TEXT` | LiteLLM model string (e.g. `groq/llama-3.3-70b-versatile`) |
+| `-v / --verbose` | Show chunk count, token usage, model/key selection, crawl status |
+| `--no-mermaid` | Disable mermaid diagram generation |
+| `--no-cache` | Skip cache read/write for this run |
+| `-n / --dry-run` | Print token/cost estimate; skip LLM call |
+| `--format TEXT` | Output format: `obsidian` (default) · `logseq` · `plain` · `roam` |
+| `--force` | Skip playlist videos without captions instead of aborting |
+| `--force-restart` | Ignore playlist resume file, reprocess all videos |
+## Config file
+### Location
+| OS | Path |
+|---|---|
+| Linux | `~/.config/notes-gen/config.yaml` |
+| macOS | `~/.config/notes-gen/config.yaml` |
+| Windows | `%USERPROFILE%\.config\notes-gen\config.yaml` |
+Run `notegen config init` to generate a fully-commented template, then `notegen config open` to edit it.
+### Full reference (`~/.config/notes-gen/config.yaml`)
+```yaml
+# Active model — format: <provider>/<model-name>
+model: anthropic/claude-sonnet-4-6
+# Output
+output_dir: ~/notes
+mermaid: true
+# Output format: obsidian (default) | logseq | plain | roam
+output_format: obsidian
+# Caching — transcripts + LLM output cached in ~/.cache/notegen/
+# Set to false to always re-fetch and re-generate
+cache: true
+# Token budget — compress output if it exceeds this many tokens (0 = no limit)
+max_output_tokens: 0
+# Fuzzy dedup — skip near-duplicate sections in merged notes (Jaccard threshold)
+merger_similarity_threshold: 0.7
+# API key rotation — add multiple keys per provider.
+# notegen picks one at random each request (useful for free-tier rate limits).
+api_keys:
+  anthropic:
+    - sk-ant-api03-KEY1
+    - sk-ant-api03-KEY2   # second key rotated in automatically
+  groq:
+    - gsk_KEY1
+  openai:
+    - sk-proj-KEY1
+  gemini:
+    - AIzaSyKEY1
+  nvidia_nim:
+    - nvapi-KEY1
+  mistral:
+    - KEY1
+  cohere:
+    - KEY1
+  together_ai:
+    - KEY1
+  deepseek:
+    - sk-KEY1
+  perplexity:
+    - pplx-KEY1
+  xai:
+    - xai-KEY1
+# Web crawl limits
+max_concurrent: 5
+web_max_pages: 50
+web_max_depth: 3
+# Rate limiting & retry (important for free-tier providers like Groq, Gemini)
+max_retries: 5
+retry_base_delay: 60.0   # seconds; backoff = base * 2^attempt
+```
+### Supported providers
+| Provider | Model string example |
+|---|---|
+| Anthropic | `anthropic/claude-sonnet-4-6` |
+| OpenAI | `openai/gpt-4o` |
+| Groq | `groq/llama-3.3-70b-versatile` |
+| Google Gemini | `gemini/gemini-2.0-flash` |
+| NVIDIA NIM | `nvidia_nim/meta/llama-3.1-70b-instruct` |
+| Mistral | `mistral/mistral-large-latest` |
+| Cohere | `cohere/command-r-plus` |
+| Together AI | `together_ai/meta-llama/Llama-3-70b-chat-hf` |
+| DeepSeek | `deepseek/deepseek-chat` |
+| Perplexity | `perplexity/sonar-pro` |
+| xAI (Grok) | `xai/grok-2` |
+| Ollama (local) | `ollama/llama3` |
+Any provider supported by [LiteLLM](https://docs.litellm.ai/docs/providers) works.
+## Env var API keys
+As an alternative to the config file, set `NOTEGEN_<PROVIDER>_KEY` env vars. These are used as a fallback when no keys are configured for a provider:
+```bash
+export NOTEGEN_GROQ_KEY=gsk_...
+export NOTEGEN_ANTHROPIC_KEY=sk-ant-...
+export NOTEGEN_GEMINI_KEY=AIzaSy...
+```
+Config keys take priority over env vars. Env vars are useful for CI or server use.
+## Caching
+Transcripts and LLM-generated notes are cached in `~/.cache/notegen/` (transcripts keyed on URL; notes keyed on URL + model). Re-running the same source skips fetch and LLM calls entirely.
+```bash
+notegen video <url>          # first run: fetches + generates + caches
+notegen video <url>          # second run: serves from cache instantly
+notegen video <url> --no-cache   # bypass cache for this run
+notegen cache clear          # wipe all cached files
+```
+## Dry run
+Estimate tokens and cost before committing to a run:
+```bash
+notegen -n https://youtube.com/playlist?list=...
+```
+Prints a Rich table with chunk count, token count, estimated cost, and estimated generation time. No LLM calls are made, no files are written.
+## Output formats
+Use `--format` to target different note-taking apps:
+| Format | Syntax style |
+|---|---|
+| `obsidian` (default) | `[[wikilinks]]`, `> [!TIP]` callouts, mermaid diagrams |
+| `logseq` | Bullet-based, `#+BEGIN_TIP` blocks |
+| `plain` | Clean markdown, no app-specific syntax |
+| `roam` | `#[[hashtag refs]]` |
+## Watch mode
+Drop files into a folder and notegen auto-processes them:
+```bash
+notegen watch ./inbox --output-dir ./notes
+```
+- Processes existing unprocessed `.txt`/`.md` files on startup
+- Watches for new files; processes each as it appears
+- Tracks processed files in `.watch-state.json` (won't reprocess on restart)
+- Ctrl+C exits cleanly
+## Playlist resume
+Long playlists are resumable. Progress is saved to `.progress.json` in the output folder after each video. If a run is interrupted, re-running the same command skips already-completed videos.
+```bash
+notegen playlist <url>            # resumes from where it left off
+notegen playlist <url> --force-restart   # ignore progress, reprocess all
+```
+## Rate limiting
+Free-tier providers (Groq, Gemini, Together AI, etc.) enforce strict TPM/RPM limits. notegen handles 429 errors automatically:
+1. Cools down the offending key and rotates to another available key immediately.
+2. If all keys for the provider are exhausted, waits using the `Retry-After` header value (if present) or exponential backoff (`retry_base_delay * 2^attempt`), then retries.
+With the defaults (`max_retries: 5`, `retry_base_delay: 60`), the wait sequence is 60s → 120s → 240s → 480s → 960s. Adding multiple API keys from different free accounts is the most effective way to stay under limits.
+## Output format
+Obsidian-flavored markdown (default):
+- YAML frontmatter (`title`, `source`, `type`, `tags`, `date`)
+- `##` / `###` headings only
+- `> [!TIP]` / `> [!WARNING]` callouts
+- Mermaid diagrams for flows and architectures
+- `[[wikilinks]]` for cross-references
+- Tags auto-inferred by LLM from content
+- Playlist → folder + `index.md` with wikilinks to each video note
+## Requirements
+- Python ≥ 3.11
+- API key for at least one supported LLM provider

notegen-2.0.0/notes_gen/cache.py ADDED Viewed

@@ -0,0 +1,63 @@
+from __future__ import annotations
+import hashlib
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+# Directory that holds every cache entry (<home>/.cache/notegen); resolved once at import time.
+_CACHE_DIR = Path.home() / ".cache" / "notegen"
+def _key(data: str) -> str:
+    """Return the SHA-256 hex digest of *data*, encoded with ``str.encode()``'s default (UTF-8)."""
+    digest = hashlib.sha256()
+    digest.update(data.encode())
+    return digest.hexdigest()
+def _cache_file(key: str) -> Path:
+    """Return the path of the JSON file that stores the cache entry for *key*."""
+    filename = f"{key}.json"
+    return _CACHE_DIR / filename
+def _read(key: str) -> str | None:
+    """Return the cached ``content`` string for *key*, or ``None`` on a cache miss.
+    A missing, unreadable, undecodable, or malformed cache file is treated as a
+    miss, so a damaged entry never aborts the caller.
+    """
+    try:
+        # Read directly instead of checking exists() first: one filesystem
+        # access, and no window in which the file can vanish between the two.
+        payload = json.loads(_cache_file(key).read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # OSError: file missing or unreadable.
+        # ValueError: invalid JSON (JSONDecodeError) or invalid UTF-8 (UnicodeDecodeError).
+        return None
+    # Valid JSON that is not an object has no "content" field to return.
+    if not isinstance(payload, dict):
+        return None
+    content = payload.get("content")
+    # Honour the declared return type: anything other than a string is a miss.
+    return content if isinstance(content, str) else None
+def _write(key: str, content: str, url: str) -> None:
+    """Store *content* for *key* as a JSON file, creating the cache directory if needed.
+    The entry also records the source *url* and a UTC ISO-8601 ``cached_at`` timestamp.
+    """
+    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    target = _cache_file(key)
+    entry = {
+        "content": content,
+        "url": url,
+        "cached_at": datetime.now(timezone.utc).isoformat(),
+    }
+    serialized = json.dumps(entry)
+    target.write_text(serialized, encoding="utf-8")
+def get_transcript_cache(url: str) -> str | None:
+    """Return the cached transcript for *url*, or ``None`` if there is no usable entry."""
+    cache_key = _key(f"transcript:{url}")
+    return _read(cache_key)
+def set_transcript_cache(url: str, text: str) -> None:
+    """Cache the transcript *text* under a key derived from *url* alone."""
+    cache_key = _key(f"transcript:{url}")
+    _write(cache_key, text, url)
+def get_notes_cache(url: str, model: str) -> str | None:
+    """Return the cached notes for the (*url*, *model*) pair, or ``None`` if there is no usable entry."""
+    cache_key = _key(f"notes:{url}:{model}")
+    return _read(cache_key)
+def set_notes_cache(url: str, model: str, notes: str) -> None:
+    """Cache the generated *notes* under a key derived from both *url* and *model*."""
+    cache_key = _key(f"notes:{url}:{model}")
+    _write(cache_key, notes, url)
+def clear_cache() -> int:
+    """Delete every ``*.json`` file directly inside the cache directory.
+    Returns the number of files removed, or 0 when the cache directory does not
+    exist. The directory itself is left in place.
+    """
+    removed = 0
+    if _CACHE_DIR.exists():
+        # Snapshot the matches first so deletion does not interfere with the glob.
+        for entry in list(_CACHE_DIR.glob("*.json")):
+            entry.unlink()
+            removed += 1
+    return removed

notegen 1.0.2__tar.gz → 2.0.0__tar.gz

notegen 1.0.2tar.gz → 2.0.0tar.gz