dual-wordcloud 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dual_wordcloud-0.1.0/.gitignore +6 -0
- dual_wordcloud-0.1.0/PKG-INFO +136 -0
- dual_wordcloud-0.1.0/README.ko.md +123 -0
- dual_wordcloud-0.1.0/README.md +123 -0
- dual_wordcloud-0.1.0/pyproject.toml +26 -0
- dual_wordcloud-0.1.0/src/dual_wordcloud/__init__.py +3 -0
- dual_wordcloud-0.1.0/src/dual_wordcloud/_renderer.py +171 -0
- dual_wordcloud-0.1.0/src/dual_wordcloud/_splitter.py +39 -0
- dual_wordcloud-0.1.0/src/dual_wordcloud/core.py +154 -0
- dual_wordcloud-0.1.0/tests/test_core.py +88 -0
- dual_wordcloud-0.1.0/tests/test_renderer.py +71 -0
- dual_wordcloud-0.1.0/tests/test_splitter.py +63 -0
- dual_wordcloud-0.1.0/uv.lock +679 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dual-wordcloud
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
License: MIT
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: matplotlib-venn>=0.11.0
|
|
7
|
+
Requires-Dist: matplotlib>=3.8.0
|
|
8
|
+
Requires-Dist: numpy>=1.24.0
|
|
9
|
+
Requires-Dist: pillow>=10.0.0
|
|
10
|
+
Requires-Dist: shapely>=2.0.0
|
|
11
|
+
Requires-Dist: wordcloud>=1.9.0
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# dual-wordcloud
|
|
15
|
+
|
|
16
|
+
Venn diagram–style wordcloud that splits keywords across three regions: left-only, shared (center), and right-only.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install dual-wordcloud
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
### Mode 1: Direct regions (`from_regions`)
|
|
27
|
+
|
|
28
|
+
Use when you already have keywords pre-divided into three groups — e.g. positive / neutral / negative sentiment.
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from collections import Counter
|
|
32
|
+
from dual_wordcloud import DualWordCloud
|
|
33
|
+
|
|
34
|
+
positive = Counter({"성장": 42, "혁신": 35, "안정": 28})
|
|
35
|
+
neutral = Counter({"금리": 20, "실적": 18})
|
|
36
|
+
negative = Counter({"손실": 30, "위기": 25, "부채": 15})
|
|
37
|
+
|
|
38
|
+
dwc = DualWordCloud.from_regions(
|
|
39
|
+
left=positive,
|
|
40
|
+
center=neutral,
|
|
41
|
+
right=negative,
|
|
42
|
+
left_label="긍정",
|
|
43
|
+
right_label="부정",
|
|
44
|
+
left_border_color="#3498db",
|
|
45
|
+
right_border_color="#e74c3c",
|
|
46
|
+
)
|
|
47
|
+
dwc.to_file("sentiment.png")
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Mode 2: Comparison (`from_comparison`)
|
|
51
|
+
|
|
52
|
+
Use when you have two raw keyword counters and want to compare them. Keyword placement (left / center / right) is determined automatically by normalized frequency ratio.
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from collections import Counter
|
|
56
|
+
from dual_wordcloud import DualWordCloud
|
|
57
|
+
|
|
58
|
+
bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
|
|
59
|
+
hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
|
|
60
|
+
|
|
61
|
+
dwc = DualWordCloud.from_comparison(
|
|
62
|
+
left=bnk,
|
|
63
|
+
right=hana,
|
|
64
|
+
count_left=1000, # total articles for BNK (for normalization)
|
|
65
|
+
count_right=850, # total articles for Hana
|
|
66
|
+
left_label="BNK",
|
|
67
|
+
right_label="하나",
|
|
68
|
+
)
|
|
69
|
+
dwc.to_file("comparison.png")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Keywords that appear predominantly in one source land in that source's circle. Keywords with similar frequency in both land in the center intersection.
|
|
73
|
+
|
|
74
|
+
### Output
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
dwc.to_file("output.png") # save PNG, returns Path
|
|
78
|
+
dwc.to_image() # PIL Image (for further processing)
|
|
79
|
+
dwc.show() # open in system viewer
|
|
80
|
+
dwc # inline display in Jupyter Notebook
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Parameters
|
|
84
|
+
|
|
85
|
+
### `from_regions(left, center, right, **kwargs)`
|
|
86
|
+
|
|
87
|
+
| Parameter | Type | Default | Description |
|
|
88
|
+
|-----------|------|---------|-------------|
|
|
89
|
+
| `left` | `Counter[str]` | required | Left circle keywords |
|
|
90
|
+
| `center` | `Counter[str]` | required | Intersection keywords |
|
|
91
|
+
| `right` | `Counter[str]` | required | Right circle keywords |
|
|
92
|
+
| `left_label` | `str` | `"Left"` | Left circle label |
|
|
93
|
+
| `right_label` | `str` | `"Right"` | Right circle label |
|
|
94
|
+
| `word_colors` | `dict[str, str] \| None` | `None` | Per-word hex colors (highest priority) |
|
|
95
|
+
| `colormap` | `str \| None` | `None` | matplotlib colormap name (e.g. `"Reds"`) |
|
|
96
|
+
| `font_path` | `str \| Path \| None` | `None` | Font file path. Auto-detected if `None` |
|
|
97
|
+
| `left_border_color` | `str` | `"#2980b9"` | Left circle border color |
|
|
98
|
+
| `right_border_color` | `str` | `"#e74c3c"` | Right circle border color |
|
|
99
|
+
| `left_word_color` | `str` | `"#2980b9"` | Left region word fallback color |
|
|
100
|
+
| `right_word_color` | `str` | `"#e74c3c"` | Right region word fallback color |
|
|
101
|
+
| `center_word_color` | `str` | `"#95a5a6"` | Center region word fallback color |
|
|
102
|
+
| `quality_scale` | `int` | `2` | Render quality 1–3 |
|
|
103
|
+
|
|
104
|
+
### `from_comparison(left, right, count_left, count_right, **kwargs)`
|
|
105
|
+
|
|
106
|
+
Same as `from_regions` plus:
|
|
107
|
+
|
|
108
|
+
| Parameter | Type | Default | Description |
|
|
109
|
+
|-----------|------|---------|-------------|
|
|
110
|
+
| `count_left` | `int` | required | Total document count for left (normalization denominator) |
|
|
111
|
+
| `count_right` | `int` | required | Total document count for right |
|
|
112
|
+
| `ratio_threshold` | `float` | `2.0` | Frequency ratio above which a keyword is placed exclusively in one circle |
|
|
113
|
+
|
|
114
|
+
### Word color priority
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
word_colors[keyword] → per-word color (highest)
|
|
118
|
+
colormap → matplotlib colormap
|
|
119
|
+
*_word_color → region fallback color (lowest)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Korean font
|
|
123
|
+
|
|
124
|
+
The renderer auto-detects common Korean system fonts (AppleSDGothicNeo, NanumGothic, Malgun Gothic). To use a specific font:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
dwc = DualWordCloud.from_regions(
|
|
128
|
+
...,
|
|
129
|
+
font_path="/path/to/NanumGothic.ttf",
|
|
130
|
+
)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Requirements
|
|
134
|
+
|
|
135
|
+
- Python 3.12+
|
|
136
|
+
- matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# dual-wordcloud
|
|
2
|
+
|
|
3
|
+
벤 다이어그램 스타일의 워드클라우드. 키워드를 세 영역(왼쪽 전용 / 공통(교집합) / 오른쪽 전용)으로 나눠 시각화합니다.
|
|
4
|
+
|
|
5
|
+
## 설치
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install dual-wordcloud
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## 사용법
|
|
12
|
+
|
|
13
|
+
### 모드 1: 영역 직접 지정 (`from_regions`)
|
|
14
|
+
|
|
15
|
+
키워드를 세 그룹으로 미리 나눠놓은 경우 사용합니다. 긍정/중립/부정 감성 분류가 대표적인 예입니다.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from collections import Counter
|
|
19
|
+
from dual_wordcloud import DualWordCloud
|
|
20
|
+
|
|
21
|
+
긍정 = Counter({"성장": 42, "혁신": 35, "안정": 28})
|
|
22
|
+
중립 = Counter({"금리": 20, "실적": 18})
|
|
23
|
+
부정 = Counter({"손실": 30, "위기": 25, "부채": 15})
|
|
24
|
+
|
|
25
|
+
dwc = DualWordCloud.from_regions(
|
|
26
|
+
left=긍정,
|
|
27
|
+
center=중립,
|
|
28
|
+
right=부정,
|
|
29
|
+
left_label="긍정",
|
|
30
|
+
right_label="부정",
|
|
31
|
+
left_border_color="#3498db",
|
|
32
|
+
right_border_color="#e74c3c",
|
|
33
|
+
)
|
|
34
|
+
dwc.to_file("sentiment.png")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### 모드 2: 두 대상 비교 (`from_comparison`)
|
|
38
|
+
|
|
39
|
+
두 Counter를 넣으면 키워드 배치(왼쪽/교집합/오른쪽)를 자동으로 계산합니다. 정규화 빈도 비율 기준으로 분류되므로 기사 수가 다른 두 대상도 공정하게 비교됩니다.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from collections import Counter
|
|
43
|
+
from dual_wordcloud import DualWordCloud
|
|
44
|
+
|
|
45
|
+
bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
|
|
46
|
+
hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
|
|
47
|
+
|
|
48
|
+
dwc = DualWordCloud.from_comparison(
|
|
49
|
+
left=bnk,
|
|
50
|
+
right=hana,
|
|
51
|
+
count_left=1000, # BNK 전체 기사 수 (정규화 분모)
|
|
52
|
+
count_right=850, # 하나 전체 기사 수
|
|
53
|
+
left_label="BNK",
|
|
54
|
+
right_label="하나",
|
|
55
|
+
)
|
|
56
|
+
dwc.to_file("comparison.png")
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
한쪽에서 압도적으로 많이 나오는 키워드는 해당 원 안에, 두 대상에서 비슷한 빈도로 나오는 키워드는 교집합에 배치됩니다.
|
|
60
|
+
|
|
61
|
+
### 출력
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
dwc.to_file("output.png") # PNG 파일 저장, Path 반환
|
|
65
|
+
dwc.to_image() # PIL Image 반환 (추가 가공 등)
|
|
66
|
+
dwc.show() # 시스템 뷰어로 바로 확인
|
|
67
|
+
dwc # Jupyter Notebook 셀에서 인라인 표시
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## 파라미터
|
|
71
|
+
|
|
72
|
+
### `from_regions(left, center, right, **kwargs)`
|
|
73
|
+
|
|
74
|
+
| 파라미터 | 타입 | 기본값 | 설명 |
|
|
75
|
+
|----------|------|--------|------|
|
|
76
|
+
| `left` | `Counter[str]` | 필수 | 왼쪽 원 키워드 |
|
|
77
|
+
| `center` | `Counter[str]` | 필수 | 교집합 키워드 |
|
|
78
|
+
| `right` | `Counter[str]` | 필수 | 오른쪽 원 키워드 |
|
|
79
|
+
| `left_label` | `str` | `"Left"` | 왼쪽 원 라벨 |
|
|
80
|
+
| `right_label` | `str` | `"Right"` | 오른쪽 원 라벨 |
|
|
81
|
+
| `word_colors` | `dict[str, str] \| None` | `None` | 키워드별 hex 색상 (최우선) |
|
|
82
|
+
| `colormap` | `str \| None` | `None` | matplotlib colormap명 (예: `"Reds"`) |
|
|
83
|
+
| `font_path` | `str \| Path \| None` | `None` | 폰트 경로. `None`이면 자동탐색 |
|
|
84
|
+
| `left_border_color` | `str` | `"#2980b9"` | 왼쪽 원 테두리색 |
|
|
85
|
+
| `right_border_color` | `str` | `"#e74c3c"` | 오른쪽 원 테두리색 |
|
|
86
|
+
| `left_word_color` | `str` | `"#2980b9"` | 왼쪽 단어 폴백색 |
|
|
87
|
+
| `right_word_color` | `str` | `"#e74c3c"` | 오른쪽 단어 폴백색 |
|
|
88
|
+
| `center_word_color` | `str` | `"#95a5a6"` | 교집합 단어 폴백색 |
|
|
89
|
+
| `quality_scale` | `int` | `2` | 렌더링 품질 1~3 |
|
|
90
|
+
|
|
91
|
+
### `from_comparison(left, right, count_left, count_right, **kwargs)`
|
|
92
|
+
|
|
93
|
+
`from_regions`의 모든 파라미터에 추가로:
|
|
94
|
+
|
|
95
|
+
| 파라미터 | 타입 | 기본값 | 설명 |
|
|
96
|
+
|----------|------|--------|------|
|
|
97
|
+
| `count_left` | `int` | 필수 | 왼쪽 전체 문서 수 (정규화 분모) |
|
|
98
|
+
| `count_right` | `int` | 필수 | 오른쪽 전체 문서 수 |
|
|
99
|
+
| `ratio_threshold` | `float` | `2.0` | 이 비율 초과 시 한쪽 원에 단독 배치 |
|
|
100
|
+
|
|
101
|
+
### 단어 색상 우선순위
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
word_colors[키워드] → 키워드별 개별 색상 (최우선)
|
|
105
|
+
colormap → matplotlib colormap 적용
|
|
106
|
+
*_word_color → 영역별 폴백 색상 (최후순위)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## 한글 폰트
|
|
110
|
+
|
|
111
|
+
AppleSDGothicNeo, NanumGothic, Malgun Gothic 등 주요 한글 폰트를 자동으로 탐색합니다. 특정 폰트를 지정하려면:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
dwc = DualWordCloud.from_regions(
|
|
115
|
+
...,
|
|
116
|
+
font_path="/path/to/NanumGothic.ttf",
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## 요구사항
|
|
121
|
+
|
|
122
|
+
- Python 3.12+
|
|
123
|
+
- matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# dual-wordcloud
|
|
2
|
+
|
|
3
|
+
Venn diagram–style wordcloud that splits keywords across three regions: left-only, shared (center), and right-only.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install dual-wordcloud
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
### Mode 1: Direct regions (`from_regions`)
|
|
14
|
+
|
|
15
|
+
Use when you already have keywords pre-divided into three groups — e.g. positive / neutral / negative sentiment.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from collections import Counter
|
|
19
|
+
from dual_wordcloud import DualWordCloud
|
|
20
|
+
|
|
21
|
+
positive = Counter({"성장": 42, "혁신": 35, "안정": 28})
|
|
22
|
+
neutral = Counter({"금리": 20, "실적": 18})
|
|
23
|
+
negative = Counter({"손실": 30, "위기": 25, "부채": 15})
|
|
24
|
+
|
|
25
|
+
dwc = DualWordCloud.from_regions(
|
|
26
|
+
left=positive,
|
|
27
|
+
center=neutral,
|
|
28
|
+
right=negative,
|
|
29
|
+
left_label="긍정",
|
|
30
|
+
right_label="부정",
|
|
31
|
+
left_border_color="#3498db",
|
|
32
|
+
right_border_color="#e74c3c",
|
|
33
|
+
)
|
|
34
|
+
dwc.to_file("sentiment.png")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Mode 2: Comparison (`from_comparison`)
|
|
38
|
+
|
|
39
|
+
Use when you have two raw keyword counters and want to compare them. Keyword placement (left / center / right) is determined automatically by normalized frequency ratio.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from collections import Counter
|
|
43
|
+
from dual_wordcloud import DualWordCloud
|
|
44
|
+
|
|
45
|
+
bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
|
|
46
|
+
hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
|
|
47
|
+
|
|
48
|
+
dwc = DualWordCloud.from_comparison(
|
|
49
|
+
left=bnk,
|
|
50
|
+
right=hana,
|
|
51
|
+
count_left=1000, # total articles for BNK (for normalization)
|
|
52
|
+
count_right=850, # total articles for Hana
|
|
53
|
+
left_label="BNK",
|
|
54
|
+
right_label="하나",
|
|
55
|
+
)
|
|
56
|
+
dwc.to_file("comparison.png")
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Keywords that appear predominantly in one source land in that source's circle. Keywords with similar frequency in both land in the center intersection.
|
|
60
|
+
|
|
61
|
+
### Output
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
dwc.to_file("output.png") # save PNG, returns Path
|
|
65
|
+
dwc.to_image() # PIL Image (for further processing)
|
|
66
|
+
dwc.show() # open in system viewer
|
|
67
|
+
dwc # inline display in Jupyter Notebook
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Parameters
|
|
71
|
+
|
|
72
|
+
### `from_regions(left, center, right, **kwargs)`
|
|
73
|
+
|
|
74
|
+
| Parameter | Type | Default | Description |
|
|
75
|
+
|-----------|------|---------|-------------|
|
|
76
|
+
| `left` | `Counter[str]` | required | Left circle keywords |
|
|
77
|
+
| `center` | `Counter[str]` | required | Intersection keywords |
|
|
78
|
+
| `right` | `Counter[str]` | required | Right circle keywords |
|
|
79
|
+
| `left_label` | `str` | `"Left"` | Left circle label |
|
|
80
|
+
| `right_label` | `str` | `"Right"` | Right circle label |
|
|
81
|
+
| `word_colors` | `dict[str, str] \| None` | `None` | Per-word hex colors (highest priority) |
|
|
82
|
+
| `colormap` | `str \| None` | `None` | matplotlib colormap name (e.g. `"Reds"`) |
|
|
83
|
+
| `font_path` | `str \| Path \| None` | `None` | Font file path. Auto-detected if `None` |
|
|
84
|
+
| `left_border_color` | `str` | `"#2980b9"` | Left circle border color |
|
|
85
|
+
| `right_border_color` | `str` | `"#e74c3c"` | Right circle border color |
|
|
86
|
+
| `left_word_color` | `str` | `"#2980b9"` | Left region word fallback color |
|
|
87
|
+
| `right_word_color` | `str` | `"#e74c3c"` | Right region word fallback color |
|
|
88
|
+
| `center_word_color` | `str` | `"#95a5a6"` | Center region word fallback color |
|
|
89
|
+
| `quality_scale` | `int` | `2` | Render quality 1–3 |
|
|
90
|
+
|
|
91
|
+
### `from_comparison(left, right, count_left, count_right, **kwargs)`
|
|
92
|
+
|
|
93
|
+
Same as `from_regions` plus:
|
|
94
|
+
|
|
95
|
+
| Parameter | Type | Default | Description |
|
|
96
|
+
|-----------|------|---------|-------------|
|
|
97
|
+
| `count_left` | `int` | required | Total document count for left (normalization denominator) |
|
|
98
|
+
| `count_right` | `int` | required | Total document count for right |
|
|
99
|
+
| `ratio_threshold` | `float` | `2.0` | Frequency ratio above which a keyword is placed exclusively in one circle |
|
|
100
|
+
|
|
101
|
+
### Word color priority
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
word_colors[keyword] → per-word color (highest)
|
|
105
|
+
colormap → matplotlib colormap
|
|
106
|
+
*_word_color → region fallback color (lowest)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Korean font
|
|
110
|
+
|
|
111
|
+
The renderer auto-detects common Korean system fonts (AppleSDGothicNeo, NanumGothic, Malgun Gothic). To use a specific font:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
dwc = DualWordCloud.from_regions(
|
|
115
|
+
...,
|
|
116
|
+
font_path="/path/to/NanumGothic.ttf",
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Requirements
|
|
121
|
+
|
|
122
|
+
- Python 3.12+
|
|
123
|
+
- matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dual-wordcloud"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
requires-python = ">=3.12"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
dependencies = [
|
|
8
|
+
"wordcloud>=1.9.0",
|
|
9
|
+
"matplotlib>=3.8.0",
|
|
10
|
+
"matplotlib-venn>=0.11.0",
|
|
11
|
+
"shapely>=2.0.0",
|
|
12
|
+
"pillow>=10.0.0",
|
|
13
|
+
"numpy>=1.24.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[dependency-groups]
|
|
17
|
+
dev = [
|
|
18
|
+
"pytest>=8.0.0",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["src/dual_wordcloud"]
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""듀얼 워드클라우드 렌더링 모듈."""
|
|
2
|
+
import io
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import matplotlib
|
|
7
|
+
matplotlib.use("Agg")
|
|
8
|
+
|
|
9
|
+
import matplotlib.pyplot as plt
|
|
10
|
+
import matplotlib.font_manager as fm
|
|
11
|
+
import numpy as np
|
|
12
|
+
import shapely.geometry as geom
|
|
13
|
+
from matplotlib_venn import venn2
|
|
14
|
+
from PIL import Image, ImageDraw
|
|
15
|
+
from wordcloud import WordCloud
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _resolve_font_path() -> Path | None:
|
|
19
|
+
"""시스템에서 사용 가능한 폰트를 자동으로 찾는다."""
|
|
20
|
+
candidates = [
|
|
21
|
+
Path("/System/Library/Fonts/AppleSDGothicNeo.ttc"),
|
|
22
|
+
Path("/Library/Fonts/NanumGothic.ttf"),
|
|
23
|
+
Path("/usr/share/fonts/truetype/nanum/NanumGothic.ttf"),
|
|
24
|
+
Path("C:/Windows/Fonts/malgun.ttf"),
|
|
25
|
+
]
|
|
26
|
+
for p in candidates:
|
|
27
|
+
if p.exists():
|
|
28
|
+
return p
|
|
29
|
+
return None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _make_color_func(color_lookup: dict[str, str], fallback: str) -> callable:
|
|
33
|
+
"""키워드별 색상 매핑 함수. 매핑에 없는 단어는 fallback 색상을 사용한다."""
|
|
34
|
+
def color_func(word: str, **kwargs) -> str:
|
|
35
|
+
return color_lookup.get(word, fallback)
|
|
36
|
+
return color_func
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _make_region_color_func(color: str) -> callable:
|
|
40
|
+
"""영역 전체를 단일 색상으로 칠하는 함수를 반환한다."""
|
|
41
|
+
return lambda word, **kwargs: color
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def render(
    left: Counter,
    center: Counter,
    right: Counter,
    left_label: str,
    right_label: str,
    word_colors: dict[str, str] | None,
    colormap: str | None,
    left_border_color: str,
    right_border_color: str,
    left_word_color: str,
    right_word_color: str,
    center_word_color: str,
    font_path: Path | None,
    quality_scale: int,
) -> Image.Image:
    """Render three keyword Counters as a dual (Venn-style) wordcloud.

    A two-set Venn outline is drawn with matplotlib-venn, then a separate
    WordCloud is generated inside each region (left-only, right-only,
    intersection) using that region's patch geometry as the cloud mask.

    Args:
        left: Keywords for the left-only region.
        center: Keywords for the intersection region.
        right: Keywords for the right-only region.
        left_label / right_label: Labels drawn next to each circle.
        word_colors: Per-word hex colors; takes precedence over colormap.
        colormap: matplotlib colormap name; used when word_colors is falsy.
        left_border_color / right_border_color: Circle edge colors.
        left_word_color / right_word_color / center_word_color: Fallback
            word colors per region (lowest color priority).
        font_path: Font file to use; auto-detected via _resolve_font_path()
            when None.
        quality_scale: Render quality, clamped to 1..3.

    Returns:
        The rendered figure as a PIL Image.

    Raises:
        ValueError: If all three Counters are empty.
    """
    if font_path is None:
        font_path = _resolve_font_path()

    # Clamp quality to the supported 1..3 range.
    scale = max(1, min(int(quality_scale), 3))
    border_colors = [left_border_color, right_border_color]
    # Indexed by venn2 patch order: 0 = left-only, 1 = right-only, 2 = intersection.
    word_colors_by_region = [left_word_color, right_word_color, center_word_color]

    if not any([left, center, right]):
        raise ValueError("left, center, right 중 하나 이상은 비어있지 않아야 합니다")

    # Normalize all frequencies against the single global maximum so word
    # sizes are comparable across the three regions. `or 1` guards the
    # (unreachable after the check above, but safe) all-zero case.
    global_max = max(
        max(left.values(), default=0),
        max(center.values(), default=0),
        max(right.values(), default=0),
    ) or 1
    left = Counter({k: v / global_max for k, v in left.items()})
    center = Counter({k: v / global_max for k, v in center.items()})
    right = Counter({k: v / global_max for k, v in right.items()})

    fig_width, fig_height, dpi = 25, 15, 300

    rc_params = {"axes.unicode_minus": False}
    if font_path:
        # Register the font family so venn set labels render CJK glyphs.
        font_prop = fm.FontProperties(fname=str(font_path))
        rc_params["font.family"] = font_prop.get_name()

    with matplotlib.rc_context(rc_params):
        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=dpi)
        # Fixed subset sizes (10, 10, 5) give a stable circle layout; actual
        # keyword counts do not affect the geometry.
        venn = venn2(subsets=(10, 10, 5), set_labels=(left_label, right_label), ax=ax)

        # Map venn2 subset ids to their keyword Counters.
        subset_data = {
            frozenset({0}): left,
            frozenset({1}): right,
            frozenset({0, 1}): center,
        }
        # Order matches venn.patches: left-only, right-only, intersection.
        subset_keys = [frozenset({0}), frozenset({1}), frozenset({0, 1})]

        # Pass 1: strip fills, keep colored borders on the two circles only.
        for idx, patch in enumerate(venn.patches):
            if patch is None:
                continue
            patch.set_facecolor("none")
            if idx < len(border_colors):
                patch.set_edgecolor(border_colors[idx])
                patch.set_linewidth(2)
            else:
                # Intersection patch: no border of its own.
                patch.set_linewidth(0)

        # Hide the default numeric subset-size labels.
        for label in venn.subset_labels:
            if label:
                label.set_visible(False)

        # Pass 2: generate a masked wordcloud inside each non-empty region.
        for idx, patch in enumerate(venn.patches):
            if idx >= len(subset_keys):
                continue
            counter = subset_data.get(subset_keys[idx], Counter())
            if not counter:
                continue

            path = patch.get_path()
            if path is None:
                continue
            vertices = path.vertices
            if len(vertices) < 3:
                continue

            # Region bounds in data coordinates; pixel size derived from
            # them, clamped to keep WordCloud fast and the mask sane.
            polygon = geom.Polygon(vertices)
            min_x, min_y, max_x, max_y = polygon.bounds
            width = max(100, min(800, int((max_x - min_x) * dpi * scale)))
            height = max(100, min(600, int((max_y - min_y) * dpi * scale)))

            # Build the mask: 255 (white) = blocked, 0 = drawable region,
            # per the wordcloud mask convention.
            img_mask = Image.new("L", (width, height), 255)
            draw = ImageDraw.Draw(img_mask)
            x_range = max_x - min_x if max_x > min_x else 1
            y_range = max_y - min_y if max_y > min_y else 1
            # Map data coords to pixel coords; y is flipped because image
            # row 0 is the top while data y grows upward.
            px_vertices = [
                (int((x - min_x) / x_range * (width - 1)),
                 int((max_y - y) / y_range * (height - 1)))
                for x, y in vertices
            ]
            draw.polygon(px_vertices, fill=0)
            mask = np.array(img_mask)

            # Color priority: word_colors > colormap > region fallback color.
            region_word_color = word_colors_by_region[idx] if idx < len(word_colors_by_region) else "#95a5a6"
            wc_kwargs: dict = {}
            if word_colors:
                region_lookup = {k: v for k, v in word_colors.items() if k in counter}
                wc_kwargs["color_func"] = _make_color_func(region_lookup, fallback=region_word_color)
            elif colormap:
                wc_kwargs["colormap"] = colormap
            else:
                wc_kwargs["color_func"] = _make_region_color_func(region_word_color)

            wc = WordCloud(
                font_path=str(font_path) if font_path else None,
                width=width, height=height, mask=mask,
                mode="RGBA", background_color=None,
                prefer_horizontal=0.7, min_font_size=8,
                max_font_size=int(48 * scale),
                **wc_kwargs,
            )
            wc.generate_from_frequencies(counter)
            # Overlay the cloud onto the axes, aligned with the region's
            # data-coordinate bounds; zorder keeps it above the circles.
            ax.imshow(np.array(wc), extent=[min_x, max_x, min_y, max_y],
                      aspect="auto", alpha=0.9, zorder=10)

        # NOTE(review): savefig is kept inside the rc_context so the font
        # family is still active when text is drawn — confirm this matches
        # the intended scope.
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight",
                    pad_inches=0.1, transparent=False, facecolor="white")
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf).copy()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def split_for_comparison(
    a: Counter,
    b: Counter,
    count_a: int,
    count_b: int,
    ratio_threshold: float = 2.0,
) -> tuple[Counter, Counter, Counter]:
    """Split two keyword Counters into left/center/right by normalized frequency.

    Each keyword's raw count is normalized by its source's document count,
    then placement is decided from the ratio of the normalized frequencies:

    - present in only one source        -> that side (normalized frequency)
    - norm_a / norm_b > ratio_threshold -> left  (norm_a)
    - norm_b / norm_a > ratio_threshold -> right (norm_b)
    - otherwise (similar frequency)     -> center ((norm_a + norm_b) / 2)

    Args:
        a: Keyword counts for the left source.
        b: Keyword counts for the right source.
        count_a: Total document count for the left source
            (normalization denominator).
        count_b: Total document count for the right source.
        ratio_threshold: Ratio above which a keyword is placed exclusively
            on one side.

    Returns:
        A (left, center, right) tuple of Counters holding normalized
        frequencies.

    Raises:
        ValueError: If count_a or count_b is not positive (previously this
            surfaced as an opaque ZeroDivisionError or produced negative
            frequencies).
    """
    # Validate denominators up front: zero would divide-by-zero below and a
    # negative count would silently flip the sign of every frequency.
    if count_a <= 0 or count_b <= 0:
        raise ValueError("count_a and count_b must be positive")

    all_keys = set(a.keys()) | set(b.keys())
    left: Counter = Counter()
    center: Counter = Counter()
    right: Counter = Counter()

    for k in all_keys:
        norm_a = a.get(k, 0) / count_a
        norm_b = b.get(k, 0) / count_b

        if norm_b == 0:
            left[k] = norm_a            # only in a
        elif norm_a == 0:
            right[k] = norm_b           # only in b
        elif norm_a / norm_b > ratio_threshold:
            left[k] = norm_a            # dominant in a
        elif norm_b / norm_a > ratio_threshold:
            right[k] = norm_b           # dominant in b
        else:
            center[k] = (norm_a + norm_b) / 2  # similar frequency

    return left, center, right
|