bear-export 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bear_export-0.1.0/PKG-INFO +150 -0
- bear_export-0.1.0/README.md +139 -0
- bear_export-0.1.0/pyproject.toml +12 -0
- bear_export-0.1.0/setup.cfg +4 -0
- bear_export-0.1.0/src/__init__.py +1 -0
- bear_export-0.1.0/src/bear_export.egg-info/PKG-INFO +150 -0
- bear_export-0.1.0/src/bear_export.egg-info/SOURCES.txt +11 -0
- bear_export-0.1.0/src/bear_export.egg-info/dependency_links.txt +1 -0
- bear_export-0.1.0/src/bear_export.egg-info/requires.txt +4 -0
- bear_export-0.1.0/src/bear_export.egg-info/top_level.txt +4 -0
- bear_export-0.1.0/src/converter.py +300 -0
- bear_export-0.1.0/src/organizer.py +225 -0
- bear_export-0.1.0/src/parser.py +191 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bear-export
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A command-line tool to export Bear blog CSV data to organized markdown files.
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: pandas>=2.0.0
|
|
8
|
+
Requires-Dist: click>=8.0.0
|
|
9
|
+
Requires-Dist: pyyaml>=6.0
|
|
10
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
11
|
+
|
|
12
|
+
# Bear Export CLI Tool
|
|
13
|
+
|
|
14
|
+
A command-line tool to export Bear blog CSV data to organized markdown files.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -r requirements.txt
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
python main.py [OPTIONS]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Options
|
|
29
|
+
|
|
30
|
+
- `-c, --csv-file TEXT`: Path to Bear blog CSV export file (default: post_export.csv)
|
|
31
|
+
- `-o, --output-dir TEXT`: Output directory name (default: Blog)
|
|
32
|
+
- `--include-drafts`: Include unpublished draft posts
|
|
33
|
+
- `--organize-by [date|tags|none]`: Organize files by date, tags, or no organization (default: none)
|
|
34
|
+
- `--front-matter [yaml|toml|none]`: Front matter format for static site generators (default: yaml)
|
|
35
|
+
|
|
36
|
+
### Examples
|
|
37
|
+
|
|
38
|
+
Basic export (published posts only, flat structure):
|
|
39
|
+
```bash
|
|
40
|
+
python main.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Include drafts and organize by date:
|
|
44
|
+
```bash
|
|
45
|
+
python main.py --include-drafts --organize-by date
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Organize by tags with TOML front matter:
|
|
49
|
+
```bash
|
|
50
|
+
python main.py --organize-by tags --front-matter toml
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Custom CSV file and output directory:
|
|
54
|
+
```bash
|
|
55
|
+
python main.py --csv-file my_export.csv --output-dir MyBlog
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Output Structure
|
|
59
|
+
|
|
60
|
+
### Flat Organization (default)
|
|
61
|
+
```
|
|
62
|
+
Blog/
|
|
63
|
+
├── post1.md
|
|
64
|
+
├── post2.md
|
|
65
|
+
├── post3.md
|
|
66
|
+
└── README.md
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Date Organization
|
|
70
|
+
```
|
|
71
|
+
Blog/
|
|
72
|
+
├── 2023/
|
|
73
|
+
│ ├── 01/
|
|
74
|
+
│ │ ├── january-post.md
|
|
75
|
+
│ │ └── another-january-post.md
|
|
76
|
+
│ └── 12/
|
|
77
|
+
│ └── december-post.md
|
|
78
|
+
├── 2024/
|
|
79
|
+
│ └── 03/
|
|
80
|
+
│ └── march-post.md
|
|
81
|
+
└── README.md
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Tags Organization
|
|
85
|
+
```
|
|
86
|
+
Blog/
|
|
87
|
+
├── personal/
|
|
88
|
+
│ ├── my-life.md
|
|
89
|
+
│ └── thoughts.md
|
|
90
|
+
├── coding/
|
|
91
|
+
│ ├── python-tips.md
|
|
92
|
+
│ └── vim-guide.md
|
|
93
|
+
├── uncategorized-post.md
|
|
94
|
+
└── README.md
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Front Matter
|
|
98
|
+
|
|
99
|
+
The tool generates front matter compatible with static site generators like Hugo, Jekyll, and others.
|
|
100
|
+
|
|
101
|
+
### YAML Front Matter (default)
|
|
102
|
+
```yaml
|
|
103
|
+
---
|
|
104
|
+
title: My Blog Post
|
|
105
|
+
slug: my-blog-post
|
|
106
|
+
date: 2024-03-15T10:30:00
|
|
107
|
+
tags:
|
|
108
|
+
- personal
|
|
109
|
+
- coding
|
|
110
|
+
lang: en
|
|
111
|
+
published: true
|
|
112
|
+
type: post
|
|
113
|
+
uid: abc123
|
|
114
|
+
---
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### TOML Front Matter
|
|
118
|
+
```toml
|
|
119
|
+
+++
|
|
120
|
+
title = 'My Blog Post'
|
|
121
|
+
slug = 'my-blog-post'
|
|
122
|
+
date = '2024-03-15T10:30:00'
|
|
123
|
+
tags = ['personal', 'coding']
|
|
124
|
+
lang = 'en'
|
|
125
|
+
published = true
|
|
126
|
+
type = 'post'
|
|
127
|
+
uid = 'abc123'
|
|
128
|
+
+++
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Features
|
|
132
|
+
|
|
133
|
+
- ✅ Parse Bear blog CSV exports with proper encoding
|
|
134
|
+
- ✅ Generate clean markdown files with front matter
|
|
135
|
+
- ✅ Support for YAML and TOML front matter formats
|
|
136
|
+
- ✅ Multiple organization strategies (flat, by date, by tags)
|
|
137
|
+
- ✅ Filter published/draft posts
|
|
138
|
+
- ✅ Generate index/README file with post listings
|
|
139
|
+
- ✅ Handle special characters and filenames safely
|
|
140
|
+
- ✅ Preserve post metadata (tags, dates, language, etc.)
|
|
141
|
+
|
|
142
|
+
## Requirements
|
|
143
|
+
|
|
144
|
+
- Python 3.13+
|
|
145
|
+
- pandas
|
|
146
|
+
- click
|
|
147
|
+
- pyyaml
|
|
148
|
+
- python-dateutil
|
|
149
|
+
|
|
150
|
+
See `requirements.txt` for exact versions.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Bear Export CLI Tool
|
|
2
|
+
|
|
3
|
+
A command-line tool to export Bear blog CSV data to organized markdown files.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -r requirements.txt
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python main.py [OPTIONS]
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
### Options
|
|
18
|
+
|
|
19
|
+
- `-c, --csv-file TEXT`: Path to Bear blog CSV export file (default: post_export.csv)
|
|
20
|
+
- `-o, --output-dir TEXT`: Output directory name (default: Blog)
|
|
21
|
+
- `--include-drafts`: Include unpublished draft posts
|
|
22
|
+
- `--organize-by [date|tags|none]`: Organize files by date, tags, or no organization (default: none)
|
|
23
|
+
- `--front-matter [yaml|toml|none]`: Front matter format for static site generators (default: yaml)
|
|
24
|
+
|
|
25
|
+
### Examples
|
|
26
|
+
|
|
27
|
+
Basic export (published posts only, flat structure):
|
|
28
|
+
```bash
|
|
29
|
+
python main.py
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Include drafts and organize by date:
|
|
33
|
+
```bash
|
|
34
|
+
python main.py --include-drafts --organize-by date
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Organize by tags with TOML front matter:
|
|
38
|
+
```bash
|
|
39
|
+
python main.py --organize-by tags --front-matter toml
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Custom CSV file and output directory:
|
|
43
|
+
```bash
|
|
44
|
+
python main.py --csv-file my_export.csv --output-dir MyBlog
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Output Structure
|
|
48
|
+
|
|
49
|
+
### Flat Organization (default)
|
|
50
|
+
```
|
|
51
|
+
Blog/
|
|
52
|
+
├── post1.md
|
|
53
|
+
├── post2.md
|
|
54
|
+
├── post3.md
|
|
55
|
+
└── README.md
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Date Organization
|
|
59
|
+
```
|
|
60
|
+
Blog/
|
|
61
|
+
├── 2023/
|
|
62
|
+
│ ├── 01/
|
|
63
|
+
│ │ ├── january-post.md
|
|
64
|
+
│ │ └── another-january-post.md
|
|
65
|
+
│ └── 12/
|
|
66
|
+
│ └── december-post.md
|
|
67
|
+
├── 2024/
|
|
68
|
+
│ └── 03/
|
|
69
|
+
│ └── march-post.md
|
|
70
|
+
└── README.md
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Tags Organization
|
|
74
|
+
```
|
|
75
|
+
Blog/
|
|
76
|
+
├── personal/
|
|
77
|
+
│ ├── my-life.md
|
|
78
|
+
│ └── thoughts.md
|
|
79
|
+
├── coding/
|
|
80
|
+
│ ├── python-tips.md
|
|
81
|
+
│ └── vim-guide.md
|
|
82
|
+
├── uncategorized-post.md
|
|
83
|
+
└── README.md
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Front Matter
|
|
87
|
+
|
|
88
|
+
The tool generates front matter compatible with static site generators like Hugo, Jekyll, and others.
|
|
89
|
+
|
|
90
|
+
### YAML Front Matter (default)
|
|
91
|
+
```yaml
|
|
92
|
+
---
|
|
93
|
+
title: My Blog Post
|
|
94
|
+
slug: my-blog-post
|
|
95
|
+
date: 2024-03-15T10:30:00
|
|
96
|
+
tags:
|
|
97
|
+
- personal
|
|
98
|
+
- coding
|
|
99
|
+
lang: en
|
|
100
|
+
published: true
|
|
101
|
+
type: post
|
|
102
|
+
uid: abc123
|
|
103
|
+
---
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### TOML Front Matter
|
|
107
|
+
```toml
|
|
108
|
+
+++
|
|
109
|
+
title = 'My Blog Post'
|
|
110
|
+
slug = 'my-blog-post'
|
|
111
|
+
date = '2024-03-15T10:30:00'
|
|
112
|
+
tags = ['personal', 'coding']
|
|
113
|
+
lang = 'en'
|
|
114
|
+
published = true
|
|
115
|
+
type = 'post'
|
|
116
|
+
uid = 'abc123'
|
|
117
|
+
+++
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Features
|
|
121
|
+
|
|
122
|
+
- ✅ Parse Bear blog CSV exports with proper encoding
|
|
123
|
+
- ✅ Generate clean markdown files with front matter
|
|
124
|
+
- ✅ Support for YAML and TOML front matter formats
|
|
125
|
+
- ✅ Multiple organization strategies (flat, by date, by tags)
|
|
126
|
+
- ✅ Filter published/draft posts
|
|
127
|
+
- ✅ Generate index/README file with post listings
|
|
128
|
+
- ✅ Handle special characters and filenames safely
|
|
129
|
+
- ✅ Preserve post metadata (tags, dates, language, etc.)
|
|
130
|
+
|
|
131
|
+
## Requirements
|
|
132
|
+
|
|
133
|
+
- Python 3.13+
|
|
134
|
+
- pandas
|
|
135
|
+
- click
|
|
136
|
+
- pyyaml
|
|
137
|
+
- python-dateutil
|
|
138
|
+
|
|
139
|
+
See `pyproject.toml` for the minimum supported versions.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Bear Export Package
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bear-export
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A command-line tool to export Bear blog CSV data to organized markdown files.
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: pandas>=2.0.0
|
|
8
|
+
Requires-Dist: click>=8.0.0
|
|
9
|
+
Requires-Dist: pyyaml>=6.0
|
|
10
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
11
|
+
|
|
12
|
+
# Bear Export CLI Tool
|
|
13
|
+
|
|
14
|
+
A command-line tool to export Bear blog CSV data to organized markdown files.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install -r requirements.txt
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
python main.py [OPTIONS]
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Options
|
|
29
|
+
|
|
30
|
+
- `-c, --csv-file TEXT`: Path to Bear blog CSV export file (default: post_export.csv)
|
|
31
|
+
- `-o, --output-dir TEXT`: Output directory name (default: Blog)
|
|
32
|
+
- `--include-drafts`: Include unpublished draft posts
|
|
33
|
+
- `--organize-by [date|tags|none]`: Organize files by date, tags, or no organization (default: none)
|
|
34
|
+
- `--front-matter [yaml|toml|none]`: Front matter format for static site generators (default: yaml)
|
|
35
|
+
|
|
36
|
+
### Examples
|
|
37
|
+
|
|
38
|
+
Basic export (published posts only, flat structure):
|
|
39
|
+
```bash
|
|
40
|
+
python main.py
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Include drafts and organize by date:
|
|
44
|
+
```bash
|
|
45
|
+
python main.py --include-drafts --organize-by date
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Organize by tags with TOML front matter:
|
|
49
|
+
```bash
|
|
50
|
+
python main.py --organize-by tags --front-matter toml
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Custom CSV file and output directory:
|
|
54
|
+
```bash
|
|
55
|
+
python main.py --csv-file my_export.csv --output-dir MyBlog
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Output Structure
|
|
59
|
+
|
|
60
|
+
### Flat Organization (default)
|
|
61
|
+
```
|
|
62
|
+
Blog/
|
|
63
|
+
├── post1.md
|
|
64
|
+
├── post2.md
|
|
65
|
+
├── post3.md
|
|
66
|
+
└── README.md
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Date Organization
|
|
70
|
+
```
|
|
71
|
+
Blog/
|
|
72
|
+
├── 2023/
|
|
73
|
+
│ ├── 01/
|
|
74
|
+
│ │ ├── january-post.md
|
|
75
|
+
│ │ └── another-january-post.md
|
|
76
|
+
│ └── 12/
|
|
77
|
+
│ └── december-post.md
|
|
78
|
+
├── 2024/
|
|
79
|
+
│ └── 03/
|
|
80
|
+
│ └── march-post.md
|
|
81
|
+
└── README.md
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Tags Organization
|
|
85
|
+
```
|
|
86
|
+
Blog/
|
|
87
|
+
├── personal/
|
|
88
|
+
│ ├── my-life.md
|
|
89
|
+
│ └── thoughts.md
|
|
90
|
+
├── coding/
|
|
91
|
+
│ ├── python-tips.md
|
|
92
|
+
│ └── vim-guide.md
|
|
93
|
+
├── uncategorized-post.md
|
|
94
|
+
└── README.md
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Front Matter
|
|
98
|
+
|
|
99
|
+
The tool generates front matter compatible with static site generators like Hugo, Jekyll, and others.
|
|
100
|
+
|
|
101
|
+
### YAML Front Matter (default)
|
|
102
|
+
```yaml
|
|
103
|
+
---
|
|
104
|
+
title: My Blog Post
|
|
105
|
+
slug: my-blog-post
|
|
106
|
+
date: 2024-03-15T10:30:00
|
|
107
|
+
tags:
|
|
108
|
+
- personal
|
|
109
|
+
- coding
|
|
110
|
+
lang: en
|
|
111
|
+
published: true
|
|
112
|
+
type: post
|
|
113
|
+
uid: abc123
|
|
114
|
+
---
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### TOML Front Matter
|
|
118
|
+
```toml
|
|
119
|
+
+++
|
|
120
|
+
title = 'My Blog Post'
|
|
121
|
+
slug = 'my-blog-post'
|
|
122
|
+
date = '2024-03-15T10:30:00'
|
|
123
|
+
tags = ['personal', 'coding']
|
|
124
|
+
lang = 'en'
|
|
125
|
+
published = true
|
|
126
|
+
type = 'post'
|
|
127
|
+
uid = 'abc123'
|
|
128
|
+
+++
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Features
|
|
132
|
+
|
|
133
|
+
- ✅ Parse Bear blog CSV exports with proper encoding
|
|
134
|
+
- ✅ Generate clean markdown files with front matter
|
|
135
|
+
- ✅ Support for YAML and TOML front matter formats
|
|
136
|
+
- ✅ Multiple organization strategies (flat, by date, by tags)
|
|
137
|
+
- ✅ Filter published/draft posts
|
|
138
|
+
- ✅ Generate index/README file with post listings
|
|
139
|
+
- ✅ Handle special characters and filenames safely
|
|
140
|
+
- ✅ Preserve post metadata (tags, dates, language, etc.)
|
|
141
|
+
|
|
142
|
+
## Requirements
|
|
143
|
+
|
|
144
|
+
- Python 3.13+
|
|
145
|
+
- pandas
|
|
146
|
+
- click
|
|
147
|
+
- pyyaml
|
|
148
|
+
- python-dateutil
|
|
149
|
+
|
|
150
|
+
See `requirements.txt` for exact versions.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/__init__.py
|
|
4
|
+
src/converter.py
|
|
5
|
+
src/organizer.py
|
|
6
|
+
src/parser.py
|
|
7
|
+
src/bear_export.egg-info/PKG-INFO
|
|
8
|
+
src/bear_export.egg-info/SOURCES.txt
|
|
9
|
+
src/bear_export.egg-info/dependency_links.txt
|
|
10
|
+
src/bear_export.egg-info/requires.txt
|
|
11
|
+
src/bear_export.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Markdown Converter Module
|
|
3
|
+
|
|
4
|
+
Converts Bear blog posts to markdown format with front matter.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import List, Dict, Any, Optional
|
|
10
|
+
from urllib.parse import urlparse
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MarkdownConverter:
    """Converts Bear blog posts to markdown documents with optional front matter.

    Each post is a dictionary keyed by the column names used in this module
    ("title", "slug", "content", "all tags", "publish", ...). The converter
    renders the front matter as YAML or TOML, tidies the body text and
    derives a filesystem-safe filename for every post.
    """

    # Delimiter line that opens and closes each supported front matter block.
    _DELIMITERS = {"yaml": "---", "toml": "+++"}

    # Characters that force a YAML scalar to be quoted.
    _YAML_INDICATORS = frozenset(":{}[],&*#?|-<> '\"%@`!=~")

    # Plain scalars that YAML loaders resolve to booleans or null.
    _YAML_RESERVED = frozenset(
        {"true", "false", "yes", "no", "on", "off", "y", "n", "null"}
    )

    # Plain scalars that YAML loaders would resolve to numbers.
    _YAML_NUMERIC = re.compile(
        r"[-+]?(?:0x[0-9a-f_]+|0o[0-7_]+|\.?\d[\d_.]*(?:e[-+]?\d+)?|\.(?:inf|nan))",
        re.IGNORECASE,
    )

    # Escape sequences shared by YAML double-quoted and TOML basic strings.
    _QUOTED_ESCAPES = {
        "\\": "\\\\",
        '"': '\\"',
        "\b": "\\b",
        "\t": "\\t",
        "\n": "\\n",
        "\f": "\\f",
        "\r": "\\r",
    }

    # Characters removed from filenames (reserved on common filesystems).
    _UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
    _MAX_FILENAME_LENGTH = 50

    # Names kept free because the file organizer writes its index to
    # README.md, which clashes with "readme.md" on case-insensitive
    # filesystems.
    _RESERVED_FILENAMES = ("readme.md",)

    _MARKDOWN_LINK = re.compile(r"\[([^\]]+)\]\(\s*([^)]+?)\s*\)")
    _HTML_BREAK = re.compile(r"<br\s*/?>", re.IGNORECASE)

    def __init__(self, front_matter_format: str = "yaml"):
        """
        Initialize converter with front matter format.

        Args:
            front_matter_format: 'yaml', 'toml', or 'none'. Any other value
                is treated like 'none' (no front matter is emitted).
        """
        self.front_matter_format = front_matter_format.lower()

    def convert_posts(self, posts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Convert a list of posts to markdown format.

        Args:
            posts: List of post dictionaries

        Returns:
            Copies of the posts with added 'markdown_content' and 'filename'
            fields. Filenames are unique within the returned list: a repeated
            name gets a numeric suffix ("post.md", "post-2.md", ...) so that
            no post overwrites another when the files are written.
        """
        taken = set(self._RESERVED_FILENAMES)
        converted_posts = []

        for post in posts:
            markdown_post = post.copy()
            markdown_post["markdown_content"] = self.convert_post(post)
            markdown_post["filename"] = self._unique_filename(
                self.generate_filename(post), taken
            )
            converted_posts.append(markdown_post)

        return converted_posts

    def convert_post(self, post: Dict[str, Any]) -> str:
        """
        Convert a single post to markdown format.

        Args:
            post: Post dictionary

        Returns:
            Cleaned content, preceded by a delimited front matter block when
            the configured format is 'yaml' or 'toml'.
        """
        content = self.clean_content(post.get("content", ""))

        delimiter = self._DELIMITERS.get(self.front_matter_format)
        if delimiter is None:
            # 'none' or an unrecognised format: never emit empty delimiters.
            return content

        front_matter = self.generate_front_matter(post)
        return f"{delimiter}\n{front_matter}\n{delimiter}\n\n{content}"

    def generate_front_matter(self, post: Dict[str, Any]) -> str:
        """
        Generate front matter in the specified format.

        Args:
            post: Post dictionary

        Returns:
            Front matter string without delimiters; empty when the format is
            neither 'yaml' nor 'toml'.
        """
        if self.front_matter_format == "yaml":
            return self._generate_yaml_front_matter(post)
        if self.front_matter_format == "toml":
            return self._generate_toml_front_matter(post)
        return ""

    def _front_matter_fields(self, post: Dict[str, Any]) -> List[Any]:
        """Collect the ordered (key, value) pairs shared by both formats.

        Values are str for text fields, list for tags and bool for the
        published flag. Empty and missing (None / NaN) text fields are
        omitted.
        """
        fields = []

        def add_text(key: str, source_key: str) -> None:
            value = self._text(post.get(source_key))
            if value:
                fields.append((key, value))

        add_text("title", "title")
        add_text("slug", "slug")

        # Use the published date, falling back to the first published date.
        date = self._text(post.get("published date")) or self._text(
            post.get("first published at")
        )
        if date:
            fields.append(("date", date))

        tags = self._tags(post)
        if tags:
            fields.append(("tags", tags))

        add_text("lang", "lang")
        add_text("description", "meta description")
        add_text("image", "meta image")

        is_page = self._as_bool(post.get("is page"))
        fields.append(("type", "page" if is_page else "post"))
        fields.append(("published", self._as_bool(post.get("publish", False))))

        add_text("uid", "uid")
        add_text("className", "class name")

        return fields

    def _generate_yaml_front_matter(self, post: Dict[str, Any]) -> str:
        """Generate YAML front matter."""
        lines = []

        for key, value in self._front_matter_fields(post):
            if isinstance(value, bool):
                lines.append(f"{key}: {'true' if value else 'false'}")
            elif isinstance(value, list):
                items = "\n ".join(f"- {self._escape_yaml(item)}" for item in value)
                lines.append(f"{key}:\n {items}")
            elif key == "date":
                # Left unquoted so YAML loaders can read it as a timestamp.
                lines.append(f"{key}: {value}")
            else:
                lines.append(f"{key}: {self._escape_yaml(value)}")

        return "\n".join(lines)

    def _generate_toml_front_matter(self, post: Dict[str, Any]) -> str:
        """Generate TOML front matter."""
        lines = []

        for key, value in self._front_matter_fields(post):
            if isinstance(value, bool):
                lines.append(f"{key} = {'true' if value else 'false'}")
            elif isinstance(value, list):
                items = ", ".join(self._toml_string(item) for item in value)
                lines.append(f"{key} = [{items}]")
            else:
                lines.append(f"{key} = {self._toml_string(value)}")

        return "\n".join(lines)

    def generate_filename(self, post: Dict[str, Any]) -> str:
        """
        Generate a safe filename for the post.

        Args:
            post: Post dictionary

        Returns:
            Filename with .md extension, built from the slug, else the title,
            else the UID, else "untitled". If the chosen value cleans down to
            nothing, "post-<uid>" is used with the UID sanitised as well.
        """
        raw_name = (
            self._text(post.get("slug"))
            or self._text(post.get("title"))
            or self._text(post.get("uid"))
            or "untitled"
        )

        name = self._clean_filename(raw_name)

        if not name:
            # The UID is cleaned too so it cannot smuggle path separators.
            uid = self._clean_filename(self._text(post.get("uid"))) or "unknown"
            name = f"post-{uid}"

        return f"{name}.md"

    def clean_content(self, content: str) -> str:
        """
        Clean and process post content.

        Args:
            content: Raw post content

        Returns:
            Content with normalised line endings, markdown fixes applied,
            runs of blank lines collapsed and surrounding whitespace removed.
        """
        text = self._text(content)
        if not text:
            return ""

        # Normalize line endings
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        # Apply the markdown fixes first: replacing <br> introduces newlines
        # that the blank-line collapse below must also see.
        text = self._fix_bear_markdown(text)

        # Collapse three or more consecutive newlines into one blank line
        text = re.sub(r"\n{3,}", "\n\n", text)

        return text.strip()

    def _fix_bear_markdown(self, content: str) -> str:
        """Fix Bear-specific markdown issues.

        Trims whitespace just inside link/image targets, e.g.
        "[text]( url )" becomes "[text](url)", and replaces HTML <br> tags
        with newlines.
        """
        # The target group is lazy so the trailing \s* can claim the
        # whitespace before the closing parenthesis.
        content = self._MARKDOWN_LINK.sub(r"[\1](\2)", content)

        # NOTE(review): this also rewrites <br> inside fenced code blocks.
        content = self._HTML_BREAK.sub("\n", content)

        return content

    def _clean_filename(self, name: str) -> str:
        """Clean string to be safe for filename.

        Returns a lowercase, hyphen-separated name of at most 50 characters,
        or an empty string when nothing usable remains.
        """
        name = self._text(name)

        # Whitespace and underscores become hyphens before unsafe characters
        # are dropped, so tabs/newlines still act as word separators.
        name = re.sub(r"[\s_]+", "-", name)
        name = self._UNSAFE_FILENAME_CHARS.sub("", name)
        name = re.sub(r"-+", "-", name)

        # No leading/trailing hyphens or dots (this also removes "." / "..").
        name = name.strip("-.").lower()

        if len(name) > self._MAX_FILENAME_LENGTH:
            name = name[: self._MAX_FILENAME_LENGTH].rstrip("-.")

        return name

    def _unique_filename(self, filename: str, taken: set) -> str:
        """Return *filename*, suffixed with -2, -3, ... if already in *taken*.

        The returned name is added to *taken*.
        """
        stem = filename[:-3] if filename.endswith(".md") else filename
        candidate = filename
        counter = 2
        while candidate in taken:
            candidate = f"{stem}-{counter}.md"
            counter += 1
        taken.add(candidate)
        return candidate

    @staticmethod
    def _text(value: Any) -> str:
        """Return *value* as text, mapping None and NaN to an empty string."""
        if value is None:
            return ""
        if isinstance(value, float) and value != value:  # NaN
            return ""
        return str(value)

    @staticmethod
    def _as_bool(value: Any) -> bool:
        """Interpret a bool, number, or CSV-style text flag as a boolean."""
        if isinstance(value, str):
            return value.strip().lower() in {"true", "1", "yes", "y"}
        try:
            if value != value:  # NaN
                return False
            return bool(value)
        except (TypeError, ValueError):
            return False

    @classmethod
    def _tags(cls, post: Dict[str, Any]) -> List[str]:
        """Return the post's non-empty tags as a list of strings."""
        raw = post.get("all tags")
        if isinstance(raw, str):
            # NOTE(review): assumes a bare string is one tag, not an encoded
            # list -- confirm against the parser's output.
            raw = [raw]
        elif not isinstance(raw, (list, tuple, set)):
            return []
        return [tag for tag in (cls._text(item) for item in raw) if tag]

    @classmethod
    def _escape_quoted(cls, text: str) -> str:
        """Escape *text* for use between double quotes (YAML or TOML)."""
        pieces = []
        for char in text:
            if char in cls._QUOTED_ESCAPES:
                pieces.append(cls._QUOTED_ESCAPES[char])
            elif ord(char) < 0x20 or char in "\x7f\x85\u2028\u2029":
                pieces.append(f"\\u{ord(char):04X}")
            else:
                pieces.append(char)
        return "".join(pieces)

    def _escape_yaml(self, text: str) -> str:
        """Escape text for YAML.

        Returns a scalar that loads back as the same string: plain when that
        is unambiguous, single-quoted when the text contains indicator
        characters or would otherwise be read as a boolean, null or number,
        and double-quoted with escapes when it contains control characters.
        """
        if not text:
            return ""

        text = str(text)

        if any(ord(char) < 0x20 or char in "\x7f\x85\u2028\u2029" for char in text):
            return f'"{self._escape_quoted(text)}"'

        needs_quotes = (
            any(char in self._YAML_INDICATORS for char in text)
            or text.lower() in self._YAML_RESERVED
            or self._YAML_NUMERIC.fullmatch(text) is not None
        )
        if needs_quotes:
            # Quotes are doubled only inside a single-quoted scalar.
            return "'" + text.replace("'", "''") + "'"

        return text

    def _escape_toml(self, text: str) -> str:
        """Escape text for use inside a TOML basic ("double-quoted") string."""
        if not text:
            return ""

        return self._escape_quoted(str(text))

    def _toml_string(self, text: str) -> str:
        """Return *text* as a complete, quoted TOML string value.

        A literal string ('...') is used when possible. TOML performs no
        escaping inside literal strings, so values containing a single quote
        or control characters are emitted as escaped basic strings instead.
        """
        text = str(text)
        literal_ok = "'" not in text and not any(
            (ord(char) < 0x20 and char != "\t") or char == "\x7f" for char in text
        )
        if literal_ok:
            return f"'{text}'"
        return f'"{self._escape_toml(text)}"'
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File Organizer Module
|
|
3
|
+
|
|
4
|
+
Organizes markdown files into directory structures.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import shutil
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Dict, Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FileOrganizer:
    """Organizes markdown files into directory structures.

    Posts are dictionaries carrying at least 'filename' and
    'markdown_content'. Depending on the chosen organization they are written
    flat into the output directory, into YYYY/MM subdirectories, or into one
    subdirectory per tag. A README.md index linking to every post is written
    to the output directory afterwards.
    """

    _DEFAULT_FILENAME = "untitled.md"
    _INDEX_FILENAME = "README.md"

    def __init__(self, output_dir: str = "Blog", organization: str = "none"):
        """
        Initialize file organizer.

        Args:
            output_dir: Base output directory name
            organization: How to organize files ('date', 'tags', 'none')
        """
        self.output_dir = Path(output_dir)
        self.organization = organization.lower()

        # parents=True so that a nested path such as "out/Blog" also works.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def organize_posts(self, posts: List[Dict[str, Any]]) -> None:
        """
        Organize posts into directory structure and write files.

        Existing non-hidden entries of the output directory are deleted
        first.

        Args:
            posts: List of posts with markdown_content and filename
        """
        if self.output_dir.exists():
            self._clean_output_directory()

        if self.organization == "date":
            self._organize_by_date(posts)
        elif self.organization == "tags":
            self._organize_by_tags(posts)
        else:
            self._organize_flat(posts)

        self._create_index(posts)

    def _clean_output_directory(self) -> None:
        """Clean output directory of non-hidden files."""
        # NOTE(review): this deletes every non-hidden entry, so pointing the
        # output directory at an existing folder (e.g. ".") wipes its
        # contents. Consider restricting this to files the tool created.
        for item in self.output_dir.iterdir():
            if item.name.startswith("."):
                continue
            if item.is_symlink() or item.is_file():
                # Symlinks are unlinked, never followed: shutil.rmtree raises
                # on a symlinked directory.
                item.unlink()
            elif item.is_dir():
                shutil.rmtree(item)

    def _organize_flat(self, posts: List[Dict[str, Any]]) -> None:
        """Organize posts in a flat structure."""
        for post in posts:
            self._write_post(post, "none")

    def _organize_by_date(self, posts: List[Dict[str, Any]]) -> None:
        """Organize posts by year/month.

        Posts without a parseable date are written to the output root.
        """
        for post in posts:
            self._write_post(post, "date")

    def _organize_by_tags(self, posts: List[Dict[str, Any]]) -> None:
        """Organize posts by tags (posts can be in multiple tag directories).

        Posts without tags are written to the output root.
        """
        for post in posts:
            self._write_post(post, "tags")

    def _write_post(self, post: Dict[str, Any], organization: str) -> None:
        """Write one post to every location it belongs to."""
        content = post.get("markdown_content") or ""
        for relative_path in self._relative_paths(post, organization):
            self._write_file(self.output_dir / relative_path, content)

    def _relative_paths(self, post: Dict[str, Any], organization: str) -> List[Path]:
        """Return the post's file locations relative to the output directory.

        The list always has at least one entry; with 'tags' organization it
        has one entry per distinct tag directory.
        """
        # Only the final path component is kept so a filename can never point
        # outside the output directory.
        filename = (
            Path(str(post.get("filename") or self._DEFAULT_FILENAME)).name
            or self._DEFAULT_FILENAME
        )

        if organization == "date":
            date = self._parse_date(self._date_value(post))
            if date is not None:
                return [Path(str(date.year)) / f"{date.month:02d}" / filename]
        elif organization == "tags":
            directories = []
            for tag in self._post_tags(post):
                directory = self._clean_tag_name(tag)
                if directory not in directories:
                    directories.append(directory)
            if directories:
                return [Path(directory) / filename for directory in directories]

        return [Path(filename)]

    def _create_index(self, posts: List[Dict[str, Any]]) -> None:
        """Create an index file listing all posts."""
        index_content = self._generate_index_content(posts)
        index_path = self.output_dir / self._INDEX_FILENAME
        self._write_file(index_path, index_content)

    def _generate_index_content(self, posts: List[Dict[str, Any]]) -> str:
        """Generate content for index file.

        Posts are listed newest first. Each entry links to where the post
        was written for the configured organization (for tagged posts, the
        copy in the first tag directory).
        """
        lines = ["# Blog Posts\n"]

        sorted_posts = sorted(posts, key=self._get_post_date, reverse=True)

        for post in sorted_posts:
            title = str(post.get("title") or "Untitled")
            link = self._relative_paths(post, self.organization)[0].as_posix()
            raw_date = self._date_value(post)
            tags = self._post_tags(post)

            # Show the parsed date, or the raw text when it cannot be parsed.
            date_display = ""
            parsed = self._parse_date(raw_date)
            if parsed is not None:
                date_display = parsed.strftime("%Y-%m-%d")
            elif isinstance(raw_date, str):
                date_display = raw_date

            tags_display = ""
            if tags:
                tags_display = f" - `{'`, `'.join(tags)}`"

            line = f"- [{title}]({link})"
            if date_display:
                line += f" ({date_display})"
            if tags_display:
                line += f" {tags_display}"

            lines.append(line)

        lines.append("\n---")
        lines.append(f"**Total posts:** {len(posts)}")

        published_count = sum(1 for p in posts if p.get("publish", False))
        draft_count = len(posts) - published_count

        lines.append(f"**Published:** {published_count}")
        lines.append(f"**Drafts:** {draft_count}")

        return "\n".join(lines)

    def _write_file(self, filepath: Path, content: str) -> None:
        """Write content to file, creating directories as needed."""
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)

    def _clean_tag_name(self, tag: str) -> str:
        """Clean tag name for directory use.

        Returns a lowercase, hyphen-separated name, or "untagged" when
        nothing usable remains.
        """
        import re

        tag = re.sub(r"[\s_]+", "-", str(tag))
        tag = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "", tag)
        tag = re.sub(r"-+", "-", tag)
        tag = tag.strip("-.").lower()

        return tag or "untagged"

    @staticmethod
    def _date_value(post: Dict[str, Any]) -> Any:
        """Return the post's raw date: published date, else first published."""
        return post.get("published date") or post.get("first published at")

    @staticmethod
    def _post_tags(post: Dict[str, Any]) -> List[str]:
        """Return the post's non-empty tags as strings."""
        raw = post.get("all tags")
        if isinstance(raw, str):
            # NOTE(review): assumes a bare string is one tag -- confirm
            # against the parser's output.
            raw = [raw]
        elif not isinstance(raw, (list, tuple, set)):
            return []
        return [str(tag) for tag in raw if tag is not None and str(tag)]

    @staticmethod
    def _parse_date(value: Any) -> "datetime | None":
        """Parse an ISO 8601 date value; return None when that is impossible."""
        if isinstance(value, datetime):
            return value
        if not isinstance(value, str) or not value.strip():
            return None
        try:
            return datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
        except ValueError:
            return None

    def _get_post_date(self, post: Dict[str, Any]) -> datetime:
        """Get datetime for sorting posts.

        Always returns a naive datetime: timezone-aware values are converted
        to UTC and stripped of their tzinfo. Python refuses to compare naive
        with aware datetimes, so mixing them would make sorting fail.
        """
        parsed = self._parse_date(self._date_value(post))
        if parsed is None:
            # Fallback to very old date for posts without dates
            return datetime(1970, 1, 1)

        offset = parsed.utcoffset()
        if offset is not None:
            parsed = (parsed - offset).replace(tzinfo=None)
        return parsed
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Bear CSV Parser Module
|
|
3
|
+
|
|
4
|
+
Handles parsing of Bear blog export CSV files with proper encoding and data cleaning.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import json
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import List, Dict, Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BearCSVParser:
    """Parser for Bear blog CSV export files.

    ``parse`` reads the CSV, checks that every expected column is present
    and returns one cleaned dictionary per row. All cells are read as text
    so that values are not altered by type inference before cleaning.
    """

    # Encodings tried in order. "utf-8-sig" also reads plain UTF-8, so the
    # fallback has to be a different codec to be of any use.
    _ENCODINGS = ("utf-8-sig", "cp1252")

    # Date layouts that are rewritten to ISO 8601.
    _DATE_FORMATS = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d")

    # Lower-cased strings that count as True.
    _TRUE_STRINGS = frozenset({"true", "yes", "1", "on"})

    def __init__(self):
        self.expected_columns = [
            "uid",
            "title",
            "slug",
            "alias",
            "published date",
            "all tags",
            "publish",
            "make discoverable",
            "is page",
            "content",
            "canonical url",
            "meta description",
            "meta image",
            "lang",
            "class name",
            "first published at",
        ]

    def parse(self, csv_file: str) -> List[Dict[str, Any]]:
        """
        Parse Bear blog CSV file and return list of post dictionaries.

        Args:
            csv_file: Path to the CSV file

        Returns:
            List of post dictionaries with cleaned data

        Raises:
            ValueError: If the file cannot be decoded with any supported
                encoding, expected columns are missing, or reading/cleaning
                fails for any other reason. The original exception is
                attached as the cause.
        """
        try:
            df = self._read_dataframe(csv_file)

            # Validate columns (applies to every encoding attempt).
            self._validate_columns(df)

            # Convert to list of dictionaries and clean data
            return [self._clean_post_data(row.to_dict()) for _, row in df.iterrows()]

        except UnicodeDecodeError as e:
            # Must come before the generic handler: UnicodeDecodeError is a
            # ValueError subclass.
            raise ValueError(
                f"Could not read CSV file with any encoding: {str(e)}"
            ) from e
        except Exception as e:
            raise ValueError(f"Error parsing CSV file: {str(e)}") from e

    def _read_dataframe(self, csv_file: str) -> pd.DataFrame:
        """Read the CSV as text, trying each supported encoding in turn.

        Raises:
            UnicodeDecodeError: If no encoding in ``_ENCODINGS`` can decode
                the file (the error of the last attempt is raised).
        """
        last_error = None
        for encoding in self._ENCODINGS:
            try:
                # dtype=str / keep_default_na=False keep every cell exactly
                # as written: "NA" or "null" stay text instead of becoming
                # NaN, and "007" is not turned into the number 7.
                return pd.read_csv(
                    csv_file,
                    encoding=encoding,
                    dtype=str,
                    keep_default_na=False,
                )
            except UnicodeDecodeError as e:
                last_error = e
        raise last_error

    def _validate_columns(self, df: pd.DataFrame) -> None:
        """Validate that expected columns exist in the DataFrame.

        Raises:
            ValueError: If one or more expected columns are missing.
        """
        missing_columns = set(self.expected_columns) - set(df.columns)
        if missing_columns:
            raise ValueError(f"Missing expected columns in CSV: {missing_columns}")

    def _clean_post_data(self, post_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean and normalize post data.

        Args:
            post_data: Raw post data from CSV

        Returns:
            Cleaned post data dictionary
        """
        cleaned = {}

        # Basic string fields
        string_fields = [
            "uid",
            "title",
            "slug",
            "alias",
            "canonical url",
            "meta description",
            "meta image",
            "lang",
            "class name",
        ]

        for field in string_fields:
            cleaned[field] = self._clean_string(post_data.get(field, ""))

        # Date fields
        date_fields = ["published date", "first published at"]
        for field in date_fields:
            cleaned[field] = self._clean_date(post_data.get(field, ""))

        # Boolean fields
        bool_fields = ["publish", "make discoverable", "is page"]
        for field in bool_fields:
            cleaned[field] = self._clean_boolean(post_data.get(field, False))

        # Tags field (JSON array)
        cleaned["all tags"] = self._clean_tags(post_data.get("all tags", "[]"))

        # Content field
        cleaned["content"] = self._clean_content(post_data.get("content", ""))

        return cleaned

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True for None and for scalar missing markers such as NaN.

        Containers are never treated as missing. ``pd.isna`` answers
        element-wise for list-like input, and the truth value of that
        answer is ambiguous, so it must not be used in a plain ``if``.
        """
        if value is None:
            return True
        if isinstance(value, (list, tuple, set, dict)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Array-like input: not a single missing scalar.
            return False

    def _clean_string(self, value: Any) -> str:
        """Clean string values: missing becomes "", the rest is stripped text."""
        if self._is_missing(value):
            return ""
        return str(value).strip()

    def _clean_date(self, value: Any) -> str:
        """Clean date values.

        Strings matching one of ``_DATE_FORMATS`` and datetime objects are
        returned in ISO 8601 form. Any other value is returned as stripped
        text so that later stages can still display or parse it.
        """
        if self._is_missing(value) or value == "":
            return ""

        if isinstance(value, datetime):
            return value.isoformat()

        text = str(value).strip()
        if isinstance(value, str):
            # Handle various date formats
            for fmt in self._DATE_FORMATS:
                try:
                    return datetime.strptime(text, fmt).isoformat()
                except ValueError:
                    continue
        return text

    def _clean_boolean(self, value: Any) -> bool:
        """Clean boolean values.

        Missing values are False. Strings are True only when, ignoring case
        and surrounding whitespace, they are "true", "yes", "1" or "on".
        Anything else uses Python truthiness.
        """
        if self._is_missing(value):
            return False

        if isinstance(value, str):
            return value.strip().lower() in self._TRUE_STRINGS

        return bool(value)

    def _clean_tags(self, value: Any) -> List[str]:
        """Clean tags field (JSON array).

        Accepts either a JSON array encoded as a string or an already
        decoded list/tuple. Non-string and blank entries are dropped and
        the remaining tags are stripped. Anything else yields [].
        """
        if self._is_missing(value):
            return []

        if isinstance(value, str):
            if not value.strip():
                return []
            try:
                value = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                # Not valid JSON: treat as "no tags".
                return []

        if isinstance(value, (list, tuple)):
            return [
                tag.strip() for tag in value if isinstance(tag, str) and tag.strip()
            ]

        return []

    def _clean_content(self, value: Any) -> str:
        """Clean content field: missing becomes "", otherwise text is kept as-is."""
        if self._is_missing(value):
            return ""
        return str(value)
|