notegen 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notegen-0.1.0/.gitignore +11 -0
- notegen-0.1.0/PKG-INFO +91 -0
- notegen-0.1.0/README.md +59 -0
- notegen-0.1.0/notes_gen/__init__.py +0 -0
- notegen-0.1.0/notes_gen/cli.py +166 -0
- notegen-0.1.0/notes_gen/config.py +56 -0
- notegen-0.1.0/notes_gen/output/__init__.py +0 -0
- notegen-0.1.0/notes_gen/output/formatter.py +36 -0
- notegen-0.1.0/notes_gen/output/writer.py +40 -0
- notegen-0.1.0/notes_gen/processing/__init__.py +0 -0
- notegen-0.1.0/notes_gen/processing/chunker.py +32 -0
- notegen-0.1.0/notes_gen/processing/filter.py +32 -0
- notegen-0.1.0/notes_gen/processing/llm.py +46 -0
- notegen-0.1.0/notes_gen/processing/merger.py +56 -0
- notegen-0.1.0/notes_gen/sources/__init__.py +0 -0
- notegen-0.1.0/notes_gen/sources/text.py +58 -0
- notegen-0.1.0/notes_gen/sources/web.py +165 -0
- notegen-0.1.0/notes_gen/sources/youtube.py +162 -0
- notegen-0.1.0/pyproject.toml +71 -0
- notegen-0.1.0/tests/__init__.py +0 -0
- notegen-0.1.0/tests/fixtures/.gitkeep +0 -0
- notegen-0.1.0/tests/fixtures/sample_html.html +30 -0
- notegen-0.1.0/tests/fixtures/sample_transcript.txt +17 -0
- notegen-0.1.0/tests/test_chunker.py +57 -0
- notegen-0.1.0/tests/test_cli.py +100 -0
- notegen-0.1.0/tests/test_config.py +71 -0
- notegen-0.1.0/tests/test_filter.py +47 -0
- notegen-0.1.0/tests/test_formatter.py +85 -0
- notegen-0.1.0/tests/test_llm.py +77 -0
- notegen-0.1.0/tests/test_merger.py +52 -0
- notegen-0.1.0/tests/test_text.py +87 -0
- notegen-0.1.0/tests/test_web.py +153 -0
- notegen-0.1.0/tests/test_writer.py +73 -0
- notegen-0.1.0/tests/test_youtube.py +197 -0
notegen-0.1.0/.gitignore
ADDED
notegen-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: notegen
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert YouTube videos, playlists, and web pages into Obsidian markdown notes using LLMs
|
|
5
|
+
Project-URL: Homepage, https://github.com/sidhunair280/notegen
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/sidhunair280/notegen/issues
|
|
7
|
+
Author-email: Sidhu Nair <sidhunair280@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: knowledge-management,llm,markdown,notes,obsidian,youtube
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: anyio>=4
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
22
|
+
Requires-Dist: httpx>=0.27
|
|
23
|
+
Requires-Dist: litellm>=1.40
|
|
24
|
+
Requires-Dist: pyyaml>=6
|
|
25
|
+
Requires-Dist: rich>=13
|
|
26
|
+
Requires-Dist: tiktoken>=0.7
|
|
27
|
+
Requires-Dist: trafilatura>=1.12
|
|
28
|
+
Requires-Dist: typer>=0.12
|
|
29
|
+
Requires-Dist: youtube-transcript-api>=0.6
|
|
30
|
+
Requires-Dist: yt-dlp>=2024.1
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# notegen
|
|
34
|
+
|
|
35
|
+
Convert YouTube videos, playlists, and web pages into structured Obsidian-flavored markdown notes using LLMs.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install notegen
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
# Auto-detect source type
|
|
47
|
+
notegen https://youtube.com/watch?v=...
|
|
48
|
+
notegen https://example.com/article
|
|
49
|
+
notegen transcript.txt
|
|
50
|
+
|
|
51
|
+
# Explicit commands
|
|
52
|
+
notegen video <youtube-url>
|
|
53
|
+
notegen playlist <playlist-url> [--force]
|
|
54
|
+
notegen web <url>
|
|
55
|
+
notegen text <file-or-stdin>
|
|
56
|
+
|
|
57
|
+
# Config
|
|
58
|
+
notegen config init # create ~/.config/notes-gen/config.yaml
|
|
59
|
+
notegen config show # print resolved config
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Options
|
|
63
|
+
|
|
64
|
+
| Flag | Description |
|
|
65
|
+
|---|---|
|
|
66
|
+
| `-o / --output-dir` | Override output directory |
|
|
67
|
+
| `-m / --model` | LiteLLM model string (e.g. `openai/gpt-4o`) |
|
|
68
|
+
| `--no-mermaid` | Disable mermaid diagram generation |
|
|
69
|
+
| `--force` | Skip playlist videos without captions instead of aborting |
|
|
70
|
+
|
|
71
|
+
## Config file (`~/.config/notes-gen/config.yaml`)
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
output_dir: ~/notes
|
|
75
|
+
model: anthropic/claude-sonnet-4-6
|
|
76
|
+
mermaid: true
|
|
77
|
+
max_concurrent: 5
|
|
78
|
+
web_max_pages: 50
|
|
79
|
+
web_max_depth: 3
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
API keys are read from environment variables (e.g. `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`).
|
|
83
|
+
|
|
84
|
+
## Output format
|
|
85
|
+
|
|
86
|
+
Obsidian-flavored markdown with YAML frontmatter, `## / ###` headings, `> [!TIP]` / `> [!WARNING]` callouts, mermaid diagrams, and `[[wikilinks]]`.
|
|
87
|
+
|
|
88
|
+
## Requirements
|
|
89
|
+
|
|
90
|
+
- Python ≥ 3.11
|
|
91
|
+
- An API key for your chosen LLM provider
|
notegen-0.1.0/README.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# notegen
|
|
2
|
+
|
|
3
|
+
Convert YouTube videos, playlists, and web pages into structured Obsidian-flavored markdown notes using LLMs.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install notegen
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Auto-detect source type
|
|
15
|
+
notegen https://youtube.com/watch?v=...
|
|
16
|
+
notegen https://example.com/article
|
|
17
|
+
notegen transcript.txt
|
|
18
|
+
|
|
19
|
+
# Explicit commands
|
|
20
|
+
notegen video <youtube-url>
|
|
21
|
+
notegen playlist <playlist-url> [--force]
|
|
22
|
+
notegen web <url>
|
|
23
|
+
notegen text <file-or-stdin>
|
|
24
|
+
|
|
25
|
+
# Config
|
|
26
|
+
notegen config init # create ~/.config/notes-gen/config.yaml
|
|
27
|
+
notegen config show # print resolved config
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Options
|
|
31
|
+
|
|
32
|
+
| Flag | Description |
|
|
33
|
+
|---|---|
|
|
34
|
+
| `-o / --output-dir` | Override output directory |
|
|
35
|
+
| `-m / --model` | LiteLLM model string (e.g. `openai/gpt-4o`) |
|
|
36
|
+
| `--no-mermaid` | Disable mermaid diagram generation |
|
|
37
|
+
| `--force` | Skip playlist videos without captions instead of aborting |
|
|
38
|
+
|
|
39
|
+
## Config file (`~/.config/notes-gen/config.yaml`)
|
|
40
|
+
|
|
41
|
+
```yaml
|
|
42
|
+
output_dir: ~/notes
|
|
43
|
+
model: anthropic/claude-sonnet-4-6
|
|
44
|
+
mermaid: true
|
|
45
|
+
max_concurrent: 5
|
|
46
|
+
web_max_pages: 50
|
|
47
|
+
web_max_depth: 3
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
API keys are read from environment variables (e.g. `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`).
|
|
51
|
+
|
|
52
|
+
## Output format
|
|
53
|
+
|
|
54
|
+
Obsidian-flavored markdown with YAML frontmatter, `## / ###` headings, `> [!TIP]` / `> [!WARNING]` callouts, mermaid diagrams, and `[[wikilinks]]`.
|
|
55
|
+
|
|
56
|
+
## Requirements
|
|
57
|
+
|
|
58
|
+
- Python ≥ 3.11
|
|
59
|
+
- An API key for your chosen LLM provider
|
|
File without changes
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
import yaml
|
|
6
|
+
|
|
7
|
+
from notes_gen.config import DEFAULT_CONFIG_PATH, Config, load_config, merge_cli_overrides
|
|
8
|
+
|
|
9
|
+
# Root command group; invoking it with no arguments prints the help text.
app = typer.Typer(
    help="Convert YouTube/web content to Obsidian notes.",
    no_args_is_help=True,
)

# Nested command group, mounted on the root app under the name "config".
config_app = typer.Typer(help="Manage configuration.")
app.add_typer(config_app, name="config")

# First-argument tokens that are passed through untouched; any other leading
# non-option token gets the "auto" subcommand injected in front of it.
_KNOWN_SUBCOMMANDS = {"auto", "config", "playlist", "text", "video", "web"}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _run_auto(source: str, cfg: Config, force: bool = False) -> None:
    """Route *source* to a pipeline chosen by substring/prefix checks.

    Checked in order: YouTube playlist URL, YouTube video URL, any other
    http(s) URL, and finally everything else is handed to the text pipeline.
    ``force`` is forwarded to the playlist pipeline only.
    """
    on_youtube = "youtube.com" in source
    if "youtube.com/playlist" in source or ("list=" in source and on_youtube):
        from notes_gen.sources.youtube import run_playlist_pipeline

        index_path = run_playlist_pipeline(source, cfg, force=force)
        typer.echo(f"Playlist notes written to {index_path.parent}")
        return

    # The remaining pipelines share the (source, cfg) call shape and the
    # same success message, so only the import differs per branch.
    if "youtube.com/watch" in source or "youtu.be/" in source:
        from notes_gen.sources.youtube import run_video_pipeline as pipeline
    elif source.startswith(("http://", "https://")):
        from notes_gen.sources.web import run_web_crawl_pipeline as pipeline
    else:
        from notes_gen.sources.text import run_text_pipeline as pipeline

    output_path = pipeline(source, cfg)
    typer.echo(f"Notes written to {output_path}")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@config_app.command("init")
def config_init() -> None:
    """Create default config at ~/.config/notes-gen/config.yaml."""
    target = DEFAULT_CONFIG_PATH

    # Never clobber an existing file: report it and exit with status 1.
    if target.exists():
        typer.echo(f"Config already exists: {target}")
        raise typer.Exit(1)

    target.parent.mkdir(parents=True, exist_ok=True)

    defaults = Config()
    field_names = (
        "output_dir",
        "model",
        "mermaid",
        "max_concurrent",
        "web_max_pages",
        "web_max_depth",
    )
    payload = {name: getattr(defaults, name) for name in field_names}
    # The output directory is written as a plain string, not a Path object.
    payload["output_dir"] = str(payload["output_dir"])

    target.write_text(yaml.dump(payload, default_flow_style=False))
    typer.echo(f"Config written to {target}")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@config_app.command("show")
def config_show() -> None:
    """Print resolved config as YAML."""
    resolved = load_config()
    rendered = yaml.dump(
        dict(
            output_dir=str(resolved.output_dir),
            model=resolved.model,
            mermaid=resolved.mermaid,
            max_concurrent=resolved.max_concurrent,
            web_max_pages=resolved.web_max_pages,
            web_max_depth=resolved.web_max_depth,
        ),
        default_flow_style=False,
    )
    # nl=False: emit the dumped YAML as-is, without appending another newline.
    typer.echo(rendered, nl=False)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@app.command()
def video(
    url: str = typer.Argument(..., help="YouTube video URL"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", "-o"),
    model: Optional[str] = typer.Option(None, "--model", "-m"),
    no_mermaid: bool = typer.Option(False, "--no-mermaid"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
    force: bool = typer.Option(False, "--force"),
) -> None:
    """Generate notes from a YouTube video."""
    # NOTE: `verbose` and `force` are accepted but not used by this command.
    from notes_gen.sources.youtube import run_video_pipeline

    # Override `mermaid` only when --no-mermaid was actually given. Passing
    # `not no_mermaid` would always supply a bool and silently turn a
    # config-file `mermaid: false` back into True. None means "no override".
    cfg = merge_cli_overrides(
        load_config(),
        output_dir=output_dir,
        model=model,
        mermaid=False if no_mermaid else None,
    )
    output_path = run_video_pipeline(url, cfg)
    typer.echo(f"Notes written to {output_path}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@app.command()
def playlist(
    url: str = typer.Argument(..., help="YouTube playlist URL"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", "-o"),
    model: Optional[str] = typer.Option(None, "--model", "-m"),
    no_mermaid: bool = typer.Option(False, "--no-mermaid"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
    force: bool = typer.Option(False, "--force"),
) -> None:
    """Generate notes from a YouTube playlist."""
    # NOTE: `verbose` is accepted but not used by this command.
    from notes_gen.sources.youtube import run_playlist_pipeline

    # Override `mermaid` only when --no-mermaid was actually given. Passing
    # `not no_mermaid` would always supply a bool and silently turn a
    # config-file `mermaid: false` back into True. None means "no override".
    cfg = merge_cli_overrides(
        load_config(),
        output_dir=output_dir,
        model=model,
        mermaid=False if no_mermaid else None,
    )
    index_path = run_playlist_pipeline(url, cfg, force=force)
    # The pipeline returns the index file; report its containing directory.
    typer.echo(f"Playlist notes written to {index_path.parent}")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@app.command()
def web(
    url: str = typer.Argument(..., help="Web URL to fetch"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", "-o"),
    model: Optional[str] = typer.Option(None, "--model", "-m"),
    no_mermaid: bool = typer.Option(False, "--no-mermaid"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Generate notes from a web page (crawls same-domain links)."""
    # NOTE: `verbose` is accepted but not used by this command.
    from notes_gen.sources.web import run_web_crawl_pipeline

    # Override `mermaid` only when --no-mermaid was actually given. Passing
    # `not no_mermaid` would always supply a bool and silently turn a
    # config-file `mermaid: false` back into True. None means "no override".
    cfg = merge_cli_overrides(
        load_config(),
        output_dir=output_dir,
        model=model,
        mermaid=False if no_mermaid else None,
    )
    output_path = run_web_crawl_pipeline(url, cfg)
    typer.echo(f"Notes written to {output_path}")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@app.command()
def text(
    source: str = typer.Argument(..., help="File path or '-' for stdin"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", "-o"),
    model: Optional[str] = typer.Option(None, "--model", "-m"),
    no_mermaid: bool = typer.Option(False, "--no-mermaid"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Generate notes from a text file or stdin."""
    # NOTE: `verbose` is accepted but not used by this command.
    from notes_gen.sources.text import run_text_pipeline

    # Override `mermaid` only when --no-mermaid was actually given. Passing
    # `not no_mermaid` would always supply a bool and silently turn a
    # config-file `mermaid: false` back into True. None means "no override".
    cfg = merge_cli_overrides(
        load_config(),
        output_dir=output_dir,
        model=model,
        mermaid=False if no_mermaid else None,
    )
    output_path = run_text_pipeline(source, cfg)
    typer.echo(f"Notes written to {output_path}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@app.command()
def auto(
    source: str = typer.Argument(..., help="YouTube URL, web URL, or file path"),
    output_dir: Optional[Path] = typer.Option(None, "--output-dir", "-o"),
    model: Optional[str] = typer.Option(None, "--model", "-m"),
    no_mermaid: bool = typer.Option(False, "--no-mermaid"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
    force: bool = typer.Option(False, "--force"),
) -> None:
    """Auto-detect source type and generate notes."""
    # NOTE: `verbose` is accepted but not used by this command.
    # Override `mermaid` only when --no-mermaid was actually given. Passing
    # `not no_mermaid` would always supply a bool and silently turn a
    # config-file `mermaid: false` back into True. None means "no override".
    cfg = merge_cli_overrides(
        load_config(),
        output_dir=output_dir,
        model=model,
        mermaid=False if no_mermaid else None,
    )
    _run_auto(source, cfg, force=force)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def main() -> None:
    """Run the CLI, prefixing 'auto' when the first argument is a bare source.

    A first argument that is neither an option (leading "-") nor one of the
    known subcommand names is taken to be a URL or file path, so "auto" is
    inserted ahead of it in ``sys.argv`` before the Typer app is invoked.
    """
    import sys

    first = sys.argv[1] if len(sys.argv) > 1 else None
    is_bare_source = (
        first is not None
        and not first.startswith("-")
        and first not in _KNOWN_SUBCOMMANDS
    )
    if is_bare_source:
        sys.argv.insert(1, "auto")
    app()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, replace
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
# Location of the user's YAML config file.
DEFAULT_CONFIG_PATH = Path.home().joinpath(".config", "notes-gen", "config.yaml")

# Model identifier used when neither the config file nor the CLI names one.
DEFAULT_MODEL = "anthropic/claude-sonnet-4-6"


@dataclass
class Config:
    """Resolved settings for a notes-generation run.

    ``output_dir`` may be omitted (or passed as ``None``), in which case it
    falls back to ``~/notes``; any other value is coerced to a ``Path``.
    """

    output_dir: Path = None  # type: ignore[assignment]
    model: str = DEFAULT_MODEL
    mermaid: bool = True
    max_concurrent: int = 5
    web_max_pages: int = 50
    web_max_depth: int = 3

    def __post_init__(self) -> None:
        # Normalise output_dir so the attribute is always a Path afterwards.
        chosen = self.output_dir
        self.output_dir = Path.home() / "notes" if chosen is None else Path(chosen)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def load_config(path: Path = DEFAULT_CONFIG_PATH) -> Config:
    """Build a Config from the YAML file at *path*.

    Returns a default ``Config()`` when the file does not exist. Only the
    recognised keys are read; anything else in the file is ignored. An empty
    file is treated as an empty mapping.

    ``output_dir`` has a leading ``~`` expanded to the user's home directory,
    so a hand-written ``output_dir: ~/notes`` resolves to the home directory
    rather than to a literal ``~`` folder under the current working
    directory. A null ``output_dir`` is treated as "not set".
    """
    if not path.exists():
        return Config()

    raw = yaml.safe_load(path.read_text()) or {}
    kwargs: dict = {}

    # Skip a null value instead of crashing in Path(None); Config then
    # applies its own default directory.
    if "output_dir" in raw and raw["output_dir"] is not None:
        kwargs["output_dir"] = Path(raw["output_dir"]).expanduser()

    for field in ("model", "mermaid", "max_concurrent", "web_max_pages", "web_max_depth"):
        if field in raw:
            kwargs[field] = raw[field]

    return Config(**kwargs)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def merge_cli_overrides(
    cfg: Config,
    *,
    output_dir: Optional[Path] = None,
    model: Optional[str] = None,
    mermaid: Optional[bool] = None,
) -> Config:
    """Return a copy of *cfg* with the supplied CLI values applied.

    An argument left as ``None`` means "not given on the command line" and
    keeps the corresponding value from *cfg*. The input object is not
    modified; a new instance is produced via ``dataclasses.replace``.
    """
    candidates = {"output_dir": output_dir, "model": model, "mermaid": mermaid}
    supplied = {name: value for name, value in candidates.items() if value is not None}
    return replace(cfg, **supplied)
|
|
File without changes
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import unicodedata
|
|
5
|
+
from datetime import date
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def slugify(text: str) -> str:
    """Turn *text* into a lowercase, hyphen-separated ASCII slug.

    Characters are NFKD-decomposed and anything that cannot be encoded as
    ASCII is dropped. Punctuation other than word characters, whitespace and
    hyphens is removed, runs of whitespace/underscores collapse to a single
    hyphen, and leading/trailing hyphens are stripped. Falsy input yields "".
    """
    if not text:
        return ""

    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    kept = re.sub(r"[^\w\s-]", "", ascii_only.lower())
    return re.sub(r"[\s_]+", "-", kept).strip("-")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_frontmatter(
    title: str,
    source: str,
    type: str,
    tags: list[str],
    date: date,
) -> str:
    """Render note metadata as a YAML block delimited by ``---`` lines.

    The date is serialised with ``isoformat()``; the YAML is dumped in block
    style with non-ASCII characters left unescaped.
    """
    fields = dict(
        title=title,
        source=source,
        type=type,
        tags=tags,
        date=date.isoformat(),
    )
    body = yaml.dump(fields, default_flow_style=False, allow_unicode=True)
    return "---\n" + body + "---\n"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def write_note(
    path: Path,
    content: str,
    overwrite_policy: str = "overwrite",
) -> Path:
    """Write *content* to *path* as UTF-8 and return the path written.

    Parent directories are created as needed. When *path* already exists,
    ``overwrite_policy`` decides what happens:

    * ``"skip"``   - leave the file untouched and return *path*.
    * ``"rename"`` - write to the first free ``<stem>-<n><suffix>`` sibling
      (n = 1, 2, ...) and return that path.
    * anything else - replace the existing file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    destination = path
    if path.exists():
        if overwrite_policy == "skip":
            return path
        if overwrite_policy == "rename":
            attempt = 0
            while True:
                attempt += 1
                destination = path.parent / f"{path.stem}-{attempt}{path.suffix}"
                if not destination.exists():
                    break

    destination.write_text(content, encoding="utf-8")
    return destination
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def write_index(playlist_dir: Path, video_slugs: list[str]) -> Path:
    """Write ``index.md`` into *playlist_dir* and return its path.

    The file holds a level-1 heading derived from the directory name
    (hyphens become spaces, then title-cased), a blank line, and one
    ``- [[slug]]`` bullet per entry in *video_slugs*, in order. The
    directory is created if it does not exist.
    """
    playlist_dir.mkdir(parents=True, exist_ok=True)

    heading = playlist_dir.name.replace("-", " ").title()
    bullets = "".join(f"- [[{slug}]]\n" for slug in video_slugs)

    index_path = playlist_dir / "index.md"
    index_path.write_text(f"# {heading}\n\n{bullets}", encoding="utf-8")
    return index_path
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import tiktoken
|
|
4
|
+
|
|
5
|
+
# Tokenizer shared by the helpers in this module, created once at import.
_ENCODING = tiktoken.get_encoding("cl100k_base")


def count_tokens(text: str) -> int:
    """Return how many tokens *text* encodes to under ``_ENCODING``."""
    token_ids = _ENCODING.encode(text)
    return len(token_ids)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def chunk_text(text: str, max_tokens: int = 12000, overlap: int = 200) -> list[str]:
    """Split *text* into token-bounded chunks that overlap at the seams.

    Args:
        text: Input text. Blank or whitespace-only input yields ``[]``.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens repeated at the start of each chunk from
            the end of the previous one.

    Returns:
        ``[text]`` unchanged when it already fits in ``max_tokens``;
        otherwise the decoded token windows, in order.

    Raises:
        ValueError: If the text has to be split and ``max_tokens`` is not
            positive, or ``overlap`` is negative or not smaller than
            ``max_tokens``. Those values would either never advance the
            window (an endless loop) or skip tokens between chunks.
    """
    if not text.strip():
        return []

    tokens = _ENCODING.encode(text)
    total = len(tokens)

    if total <= max_tokens:
        return [text]

    # Validated only once splitting is actually required, so short inputs
    # keep returning [text] regardless of the overlap value.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    chunks: list[str] = []
    start = 0
    while start < total:
        end = min(start + max_tokens, total)
        chunks.append(_ENCODING.decode(tokens[start:end]))
        if end >= total:
            break
        # Step back by `overlap` so neighbouring chunks share context.
        start = end - overlap

    return chunks
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# Case-insensitive patterns for promotional / call-to-action phrases. A line
# matching any of them is dropped entirely by remove_meta().
_META_PATTERNS = [
    # "sponsor", "sponsored", "sponsorship". The previous `sponsored?` made
    # only the trailing "d" optional, so the bare word "sponsor" never matched.
    r"(?i)\b(sponsor(?:ed|ship)?)\b.{0,120}",
    r"(?i)\bsmash\s+that\s+(like|subscribe)\b.{0,80}",
    r"(?i)\blike\s+(and\s+)?subscribe\b.{0,80}",
    r"(?i)\bsubscribe\s+(to\s+)?(my|the|this)\s+(channel|newsletter|podcast)\b.{0,80}",
    r"(?i)\bfollow\s+(me\s+)?on\s+(twitter|instagram|tiktok|linkedin|facebook)\b.{0,80}",
    r"(?i)\bcheck\s+out\s+my\s+merch\b.{0,100}",
    r"(?i)\bsupport\s+(me\s+)?on\s+patreon\b.{0,100}",
    r"(?i)\bjoin\s+(my\s+)?discord\b.{0,80}",
    r"(?i)\bget\s+\d+%\s+off\b.{0,120}",
    r"(?i)\buse\s+(code|promo)\s+\w+\s+(for|to\s+get)\b.{0,120}",
    r"(?i)\bnordvpn\b.{0,150}",
    r"(?i)\bteespring\b.{0,100}",
]

# Compiled once at import so remove_meta() does not recompile per call.
_COMPILED = [re.compile(p) for p in _META_PATTERNS]


def remove_meta(text: str) -> str:
    """Drop every line of *text* that matches a promotional pattern.

    Lines are split on ``"\\n"`` and rejoined with ``"\\n"``; surviving lines
    are returned unchanged and in order. Falsy input is returned as-is.
    """
    if not text:
        return text
    return "\n".join(
        line
        for line in text.split("\n")
        if not any(pat.search(line) for pat in _COMPILED)
    )
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import litellm
|
|
4
|
+
|
|
5
|
+
from notes_gen.config import Config
|
|
6
|
+
|
|
7
|
+
_SYSTEM_PROMPT = """\
|
|
8
|
+
You are an expert note-taker converting transcripts and articles into structured Obsidian notes.
|
|
9
|
+
|
|
10
|
+
Rules:
|
|
11
|
+
- Use Obsidian-flavored markdown: YAML frontmatter (omit here — added externally),
|
|
12
|
+
## and ### headings only
|
|
13
|
+
- Use `> [!TIP]` and `> [!WARNING]` callouts for important insights
|
|
14
|
+
- Use mermaid diagrams for flows and architectures when appropriate
|
|
15
|
+
- Use [[wikilinks]] for cross-references to related concepts
|
|
16
|
+
- Be comprehensive — never truncate to hit a length limit
|
|
17
|
+
- Auto-infer relevant tags from content
|
|
18
|
+
- Write for a technical audience learning the subject
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
_USER_PROMPT_TEMPLATE = """\
|
|
22
|
+
Convert the following content into structured Obsidian notes:
|
|
23
|
+
|
|
24
|
+
<content>
|
|
25
|
+
{chunk}
|
|
26
|
+
</content>
|
|
27
|
+
|
|
28
|
+
Produce well-organized markdown notes with clear headings, key concepts,
|
|
29
|
+
code examples where relevant, and callouts for important points.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def generate_notes(chunks: list[str], cfg: Config) -> str:
    """Generate notes for each chunk and join them with blank lines.

    One completion request is made per chunk, sequentially and in order,
    using ``cfg.model`` and a fixed temperature of 0.3. Each request carries
    the module's system prompt plus the user prompt template filled with the
    chunk.

    Returns:
        The per-chunk responses joined by ``"\\n\\n"``; ``""`` for no chunks.
    """
    results: list[str] = []
    for chunk in chunks:
        response = litellm.completion(
            model=cfg.model,
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": _USER_PROMPT_TEMPLATE.format(chunk=chunk)},
            ],
            temperature=0.3,
        )
        content = response.choices[0].message.content
        # A response may carry no text (content is None); substitute "" so
        # the join below cannot fail with a TypeError.
        results.append(content or "")
    return "\n\n".join(results)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
# A markdown heading of level 1-3 at the start of a line.
_HEADER_RE = re.compile(r"^(#{1,3})\s+(.+)$", re.MULTILINE)

# Opening/closing marker of a fenced code block: at most three spaces of
# indentation followed by three or more backticks or tildes.
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")


def merge(notes: list[str]) -> str:
    """Concatenate per-chunk notes, keeping each heading only once.

    Every note is split into (header, body) sections. A section whose header
    line (compared stripped and case-insensitively) was already emitted is
    dropped together with its body; header-less sections are always kept.
    The surviving sections are joined with blank lines.

    Lines starting with ``#`` inside fenced code blocks are not treated as
    headings, so a comment repeated in two code samples cannot cause part of
    a code block to be discarded.
    """
    if not notes:
        return ""

    seen_headers: set[str] = set()
    kept: list[tuple[str, str]] = []

    for note in notes:
        if not note.strip():
            continue
        for header, body in _split_into_sections(note):
            if header:
                key = header.strip().lower()
                if key in seen_headers:
                    continue
                seen_headers.add(key)
            kept.append((header, body))

    parts = [
        f"{header}\n\n{body.strip()}" if header else body.strip()
        for header, body in kept
    ]
    return "\n\n".join(p for p in parts if p)


def _fenced_spans(text: str) -> list[tuple[int, int]]:
    """Return (start, end) character ranges covered by fenced code blocks."""
    spans: list[tuple[int, int]] = []
    open_marker = None
    open_at = 0
    offset = 0
    for line in text.split("\n"):
        found = _FENCE_RE.match(line)
        if found:
            marker = found.group(1)
            rest = line[found.end():]
            if open_marker is None:
                # A backtick run followed by more backticks on the same line
                # is inline code, not the start of a fence.
                if not (marker[0] == "`" and "`" in rest):
                    open_marker, open_at = marker, offset
            elif (
                marker[0] == open_marker[0]
                and len(marker) >= len(open_marker)
                and not rest.strip()
            ):
                spans.append((open_at, offset + len(line)))
                open_marker = None
        offset += len(line) + 1  # +1 for the "\n" removed by split
    if open_marker is not None:
        # An unclosed fence runs to the end of the text.
        spans.append((open_at, len(text)))
    return spans


def _split_into_sections(text: str) -> list[tuple[str, str]]:
    """Split text into (header, body) pairs. First section may have no header.

    Heading matches that start inside a fenced code block are ignored.
    """
    spans = _fenced_spans(text)
    positions: list[tuple[int, str]] = []
    span_idx = 0
    for match in _HEADER_RE.finditer(text):
        start = match.start()
        # Both sequences are in ascending order, so one forward walk suffices.
        while span_idx < len(spans) and spans[span_idx][1] <= start:
            span_idx += 1
        if span_idx < len(spans) and spans[span_idx][0] <= start:
            continue
        positions.append((start, match.group()))

    if not positions:
        return [("", text)]

    sections: list[tuple[str, str]] = []
    first_start = positions[0][0]
    if first_start > 0:
        # Text ahead of the first heading becomes a header-less section.
        sections.append(("", text[:first_start]))

    next_starts = [pos for pos, _ in positions[1:]] + [len(text)]
    for (pos, header), next_pos in zip(positions, next_starts):
        body = text[pos + len(header):next_pos].lstrip("\n")
        sections.append((header, body))

    return sections
|
|
File without changes
|