dual-wordcloud 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ dist/
5
+ .pytest_cache/
6
+ *.egg-info/
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: dual-wordcloud
3
+ Version: 0.1.0
4
+ License: MIT
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: matplotlib-venn>=0.11.0
7
+ Requires-Dist: matplotlib>=3.8.0
8
+ Requires-Dist: numpy>=1.24.0
9
+ Requires-Dist: pillow>=10.0.0
10
+ Requires-Dist: shapely>=2.0.0
11
+ Requires-Dist: wordcloud>=1.9.0
12
+ Description-Content-Type: text/markdown
13
+
14
+ # dual-wordcloud
15
+
16
+ Venn diagram–style wordcloud that splits keywords across three regions: left-only, shared (center), and right-only.
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install dual-wordcloud
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ ### Mode 1: Direct regions (`from_regions`)
27
+
28
+ Use when you already have keywords pre-divided into three groups — e.g. positive / neutral / negative sentiment.
29
+
30
+ ```python
31
+ from collections import Counter
32
+ from dual_wordcloud import DualWordCloud
33
+
34
+ positive = Counter({"성장": 42, "혁신": 35, "안정": 28})
35
+ neutral = Counter({"금리": 20, "실적": 18})
36
+ negative = Counter({"손실": 30, "위기": 25, "부채": 15})
37
+
38
+ dwc = DualWordCloud.from_regions(
39
+ left=positive,
40
+ center=neutral,
41
+ right=negative,
42
+ left_label="긍정",
43
+ right_label="부정",
44
+ left_border_color="#3498db",
45
+ right_border_color="#e74c3c",
46
+ )
47
+ dwc.to_file("sentiment.png")
48
+ ```
49
+
50
+ ### Mode 2: Comparison (`from_comparison`)
51
+
52
+ Use when you have two raw keyword counters and want to compare them. Keyword placement (left / center / right) is determined automatically by normalized frequency ratio.
53
+
54
+ ```python
55
+ from collections import Counter
56
+ from dual_wordcloud import DualWordCloud
57
+
58
+ bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
59
+ hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
60
+
61
+ dwc = DualWordCloud.from_comparison(
62
+ left=bnk,
63
+ right=hana,
64
+ count_left=1000, # total articles for BNK (for normalization)
65
+ count_right=850, # total articles for Hana
66
+ left_label="BNK",
67
+ right_label="하나",
68
+ )
69
+ dwc.to_file("comparison.png")
70
+ ```
71
+
72
+ Keywords that appear predominantly in one source land in that source's circle. Keywords with similar frequency in both land in the center intersection.
73
+
74
+ ### Output
75
+
76
+ ```python
77
+ dwc.to_file("output.png") # save PNG, returns Path
78
+ dwc.to_image() # PIL Image (for further processing)
79
+ dwc.show() # open in system viewer
80
+ dwc # inline display in Jupyter Notebook
81
+ ```
82
+
83
+ ## Parameters
84
+
85
+ ### `from_regions(left, center, right, **kwargs)`
86
+
87
+ | Parameter | Type | Default | Description |
88
+ |-----------|------|---------|-------------|
89
+ | `left` | `Counter[str]` | required | Left circle keywords |
90
+ | `center` | `Counter[str]` | required | Intersection keywords |
91
+ | `right` | `Counter[str]` | required | Right circle keywords |
92
+ | `left_label` | `str` | `"Left"` | Left circle label |
93
+ | `right_label` | `str` | `"Right"` | Right circle label |
94
+ | `word_colors` | `dict[str, str] \| None` | `None` | Per-word hex colors (highest priority) |
95
+ | `colormap` | `str \| None` | `None` | matplotlib colormap name (e.g. `"Reds"`) |
96
+ | `font_path` | `str \| Path \| None` | `None` | Font file path. Auto-detected if `None` |
97
+ | `left_border_color` | `str` | `"#2980b9"` | Left circle border color |
98
+ | `right_border_color` | `str` | `"#e74c3c"` | Right circle border color |
99
+ | `left_word_color` | `str` | `"#2980b9"` | Left region word fallback color |
100
+ | `right_word_color` | `str` | `"#e74c3c"` | Right region word fallback color |
101
+ | `center_word_color` | `str` | `"#95a5a6"` | Center region word fallback color |
102
+ | `quality_scale` | `int` | `2` | Render quality 1–3 |
103
+
104
+ ### `from_comparison(left, right, count_left, count_right, **kwargs)`
105
+
106
+ Same as `from_regions` plus:
107
+
108
+ | Parameter | Type | Default | Description |
109
+ |-----------|------|---------|-------------|
110
+ | `count_left` | `int` | required | Total document count for left (normalization denominator) |
111
+ | `count_right` | `int` | required | Total document count for right |
112
+ | `ratio_threshold` | `float` | `2.0` | Frequency ratio above which a keyword is placed exclusively in one circle |
113
+
114
+ ### Word color priority
115
+
116
+ ```
117
+ word_colors[keyword] → per-word color (highest)
118
+ colormap → matplotlib colormap
119
+ *_word_color → region fallback color (lowest)
120
+ ```
121
+
122
+ ## Korean font
123
+
124
+ The renderer auto-detects common Korean system fonts (AppleSDGothicNeo, NanumGothic, Malgun Gothic). To use a specific font:
125
+
126
+ ```python
127
+ dwc = DualWordCloud.from_regions(
128
+ ...,
129
+ font_path="/path/to/NanumGothic.ttf",
130
+ )
131
+ ```
132
+
133
+ ## Requirements
134
+
135
+ - Python 3.12+
136
+ - matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
@@ -0,0 +1,123 @@
1
+ # dual-wordcloud
2
+
3
+ 벤 다이어그램 스타일의 워드클라우드. 키워드를 세 영역(왼쪽 전용 / 공통(교집합) / 오른쪽 전용)으로 나눠 시각화합니다.
4
+
5
+ ## 설치
6
+
7
+ ```bash
8
+ pip install dual-wordcloud
9
+ ```
10
+
11
+ ## 사용법
12
+
13
+ ### 모드 1: 영역 직접 지정 (`from_regions`)
14
+
15
+ 키워드를 세 그룹으로 미리 나눠놓은 경우 사용합니다. 긍정/중립/부정 감성 분류가 대표적인 예입니다.
16
+
17
+ ```python
18
+ from collections import Counter
19
+ from dual_wordcloud import DualWordCloud
20
+
21
+ 긍정 = Counter({"성장": 42, "혁신": 35, "안정": 28})
22
+ 중립 = Counter({"금리": 20, "실적": 18})
23
+ 부정 = Counter({"손실": 30, "위기": 25, "부채": 15})
24
+
25
+ dwc = DualWordCloud.from_regions(
26
+ left=긍정,
27
+ center=중립,
28
+ right=부정,
29
+ left_label="긍정",
30
+ right_label="부정",
31
+ left_border_color="#3498db",
32
+ right_border_color="#e74c3c",
33
+ )
34
+ dwc.to_file("sentiment.png")
35
+ ```
36
+
37
+ ### 모드 2: 두 대상 비교 (`from_comparison`)
38
+
39
+ 두 Counter를 넣으면 키워드 배치(왼쪽/교집합/오른쪽)를 자동으로 계산합니다. 정규화 빈도 비율 기준으로 분류되므로 기사 수가 다른 두 대상도 공정하게 비교됩니다.
40
+
41
+ ```python
42
+ from collections import Counter
43
+ from dual_wordcloud import DualWordCloud
44
+
45
+ bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
46
+ hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
47
+
48
+ dwc = DualWordCloud.from_comparison(
49
+ left=bnk,
50
+ right=hana,
51
+ count_left=1000, # BNK 전체 기사 수 (정규화 분모)
52
+ count_right=850, # 하나 전체 기사 수
53
+ left_label="BNK",
54
+ right_label="하나",
55
+ )
56
+ dwc.to_file("comparison.png")
57
+ ```
58
+
59
+ 한쪽에서 압도적으로 많이 나오는 키워드는 해당 원 안에, 두 대상에서 비슷한 빈도로 나오는 키워드는 교집합에 배치됩니다.
60
+
61
+ ### 출력
62
+
63
+ ```python
64
+ dwc.to_file("output.png") # PNG 파일 저장, Path 반환
65
+ dwc.to_image() # PIL Image 반환 (추가 가공 등)
66
+ dwc.show() # 시스템 뷰어로 바로 확인
67
+ dwc # Jupyter Notebook 셀에서 인라인 표시
68
+ ```
69
+
70
+ ## 파라미터
71
+
72
+ ### `from_regions(left, center, right, **kwargs)`
73
+
74
+ | 파라미터 | 타입 | 기본값 | 설명 |
75
+ |----------|------|--------|------|
76
+ | `left` | `Counter[str]` | 필수 | 왼쪽 원 키워드 |
77
+ | `center` | `Counter[str]` | 필수 | 교집합 키워드 |
78
+ | `right` | `Counter[str]` | 필수 | 오른쪽 원 키워드 |
79
+ | `left_label` | `str` | `"Left"` | 왼쪽 원 라벨 |
80
+ | `right_label` | `str` | `"Right"` | 오른쪽 원 라벨 |
81
+ | `word_colors` | `dict[str, str] \| None` | `None` | 키워드별 hex 색상 (최우선) |
82
+ | `colormap` | `str \| None` | `None` | matplotlib colormap명 (예: `"Reds"`) |
83
+ | `font_path` | `str \| Path \| None` | `None` | 폰트 경로. `None`이면 자동탐색 |
84
+ | `left_border_color` | `str` | `"#2980b9"` | 왼쪽 원 테두리색 |
85
+ | `right_border_color` | `str` | `"#e74c3c"` | 오른쪽 원 테두리색 |
86
+ | `left_word_color` | `str` | `"#2980b9"` | 왼쪽 단어 폴백색 |
87
+ | `right_word_color` | `str` | `"#e74c3c"` | 오른쪽 단어 폴백색 |
88
+ | `center_word_color` | `str` | `"#95a5a6"` | 교집합 단어 폴백색 |
89
+ | `quality_scale` | `int` | `2` | 렌더링 품질 1~3 |
90
+
91
+ ### `from_comparison(left, right, count_left, count_right, **kwargs)`
92
+
93
+ `from_regions`의 모든 파라미터에 추가로:
94
+
95
+ | 파라미터 | 타입 | 기본값 | 설명 |
96
+ |----------|------|--------|------|
97
+ | `count_left` | `int` | 필수 | 왼쪽 전체 문서 수 (정규화 분모) |
98
+ | `count_right` | `int` | 필수 | 오른쪽 전체 문서 수 |
99
+ | `ratio_threshold` | `float` | `2.0` | 이 비율 초과 시 한쪽 원에 단독 배치 |
100
+
101
+ ### 단어 색상 우선순위
102
+
103
+ ```
104
+ word_colors[키워드] → 키워드별 개별 색상 (최우선)
105
+ colormap → matplotlib colormap 적용
106
+ *_word_color → 영역별 폴백 색상 (최후순위)
107
+ ```
108
+
109
+ ## 한글 폰트
110
+
111
+ AppleSDGothicNeo, NanumGothic, Malgun Gothic 등 주요 한글 폰트를 자동으로 탐색합니다. 특정 폰트를 지정하려면:
112
+
113
+ ```python
114
+ dwc = DualWordCloud.from_regions(
115
+ ...,
116
+ font_path="/path/to/NanumGothic.ttf",
117
+ )
118
+ ```
119
+
120
+ ## 요구사항
121
+
122
+ - Python 3.12+
123
+ - matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
@@ -0,0 +1,123 @@
1
+ # dual-wordcloud
2
+
3
+ Venn diagram–style wordcloud that splits keywords across three regions: left-only, shared (center), and right-only.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install dual-wordcloud
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### Mode 1: Direct regions (`from_regions`)
14
+
15
+ Use when you already have keywords pre-divided into three groups — e.g. positive / neutral / negative sentiment.
16
+
17
+ ```python
18
+ from collections import Counter
19
+ from dual_wordcloud import DualWordCloud
20
+
21
+ positive = Counter({"성장": 42, "혁신": 35, "안정": 28})
22
+ neutral = Counter({"금리": 20, "실적": 18})
23
+ negative = Counter({"손실": 30, "위기": 25, "부채": 15})
24
+
25
+ dwc = DualWordCloud.from_regions(
26
+ left=positive,
27
+ center=neutral,
28
+ right=negative,
29
+ left_label="긍정",
30
+ right_label="부정",
31
+ left_border_color="#3498db",
32
+ right_border_color="#e74c3c",
33
+ )
34
+ dwc.to_file("sentiment.png")
35
+ ```
36
+
37
+ ### Mode 2: Comparison (`from_comparison`)
38
+
39
+ Use when you have two raw keyword counters and want to compare them. Keyword placement (left / center / right) is determined automatically by normalized frequency ratio.
40
+
41
+ ```python
42
+ from collections import Counter
43
+ from dual_wordcloud import DualWordCloud
44
+
45
+ bnk = Counter({"대출": 120, "금리": 95, "부채": 40, "성장": 60})
46
+ hana = Counter({"예금": 110, "금리": 88, "투자": 75, "성장": 58})
47
+
48
+ dwc = DualWordCloud.from_comparison(
49
+ left=bnk,
50
+ right=hana,
51
+ count_left=1000, # total articles for BNK (for normalization)
52
+ count_right=850, # total articles for Hana
53
+ left_label="BNK",
54
+ right_label="하나",
55
+ )
56
+ dwc.to_file("comparison.png")
57
+ ```
58
+
59
+ Keywords that appear predominantly in one source land in that source's circle. Keywords with similar frequency in both land in the center intersection.
60
+
61
+ ### Output
62
+
63
+ ```python
64
+ dwc.to_file("output.png") # save PNG, returns Path
65
+ dwc.to_image() # PIL Image (for further processing)
66
+ dwc.show() # open in system viewer
67
+ dwc # inline display in Jupyter Notebook
68
+ ```
69
+
70
+ ## Parameters
71
+
72
+ ### `from_regions(left, center, right, **kwargs)`
73
+
74
+ | Parameter | Type | Default | Description |
75
+ |-----------|------|---------|-------------|
76
+ | `left` | `Counter[str]` | required | Left circle keywords |
77
+ | `center` | `Counter[str]` | required | Intersection keywords |
78
+ | `right` | `Counter[str]` | required | Right circle keywords |
79
+ | `left_label` | `str` | `"Left"` | Left circle label |
80
+ | `right_label` | `str` | `"Right"` | Right circle label |
81
+ | `word_colors` | `dict[str, str] \| None` | `None` | Per-word hex colors (highest priority) |
82
+ | `colormap` | `str \| None` | `None` | matplotlib colormap name (e.g. `"Reds"`) |
83
+ | `font_path` | `str \| Path \| None` | `None` | Font file path. Auto-detected if `None` |
84
+ | `left_border_color` | `str` | `"#2980b9"` | Left circle border color |
85
+ | `right_border_color` | `str` | `"#e74c3c"` | Right circle border color |
86
+ | `left_word_color` | `str` | `"#2980b9"` | Left region word fallback color |
87
+ | `right_word_color` | `str` | `"#e74c3c"` | Right region word fallback color |
88
+ | `center_word_color` | `str` | `"#95a5a6"` | Center region word fallback color |
89
+ | `quality_scale` | `int` | `2` | Render quality 1–3 |
90
+
91
+ ### `from_comparison(left, right, count_left, count_right, **kwargs)`
92
+
93
+ Same as `from_regions` plus:
94
+
95
+ | Parameter | Type | Default | Description |
96
+ |-----------|------|---------|-------------|
97
+ | `count_left` | `int` | required | Total document count for left (normalization denominator) |
98
+ | `count_right` | `int` | required | Total document count for right |
99
+ | `ratio_threshold` | `float` | `2.0` | Frequency ratio above which a keyword is placed exclusively in one circle |
100
+
101
+ ### Word color priority
102
+
103
+ ```
104
+ word_colors[keyword] → per-word color (highest)
105
+ colormap → matplotlib colormap
106
+ *_word_color → region fallback color (lowest)
107
+ ```
108
+
109
+ ## Korean font
110
+
111
+ The renderer auto-detects common Korean system fonts (AppleSDGothicNeo, NanumGothic, Malgun Gothic). To use a specific font:
112
+
113
+ ```python
114
+ dwc = DualWordCloud.from_regions(
115
+ ...,
116
+ font_path="/path/to/NanumGothic.ttf",
117
+ )
118
+ ```
119
+
120
+ ## Requirements
121
+
122
+ - Python 3.12+
123
+ - matplotlib, matplotlib-venn, wordcloud, shapely, Pillow, numpy
@@ -0,0 +1,26 @@
1
+ [project]
2
+ name = "dual-wordcloud"
3
+ version = "0.1.0"
4
+ requires-python = ">=3.12"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ dependencies = [
8
+ "wordcloud>=1.9.0",
9
+ "matplotlib>=3.8.0",
10
+ "matplotlib-venn>=0.11.0",
11
+ "shapely>=2.0.0",
12
+ "pillow>=10.0.0",
13
+ "numpy>=1.24.0",
14
+ ]
15
+
16
+ [dependency-groups]
17
+ dev = [
18
+ "pytest>=8.0.0",
19
+ ]
20
+
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["src/dual_wordcloud"]
@@ -0,0 +1,3 @@
1
+ from dual_wordcloud.core import DualWordCloud
2
+
3
+ __all__ = ["DualWordCloud"]
@@ -0,0 +1,171 @@
1
+ """듀얼 워드클라우드 렌더링 모듈."""
2
+ import io
3
+ from collections import Counter
4
+ from pathlib import Path
5
+
6
+ import matplotlib
7
+ matplotlib.use("Agg")
8
+
9
+ import matplotlib.pyplot as plt
10
+ import matplotlib.font_manager as fm
11
+ import numpy as np
12
+ import shapely.geometry as geom
13
+ from matplotlib_venn import venn2
14
+ from PIL import Image, ImageDraw
15
+ from wordcloud import WordCloud
16
+
17
+
18
+ def _resolve_font_path() -> Path | None:
19
+ """시스템에서 사용 가능한 폰트를 자동으로 찾는다."""
20
+ candidates = [
21
+ Path("/System/Library/Fonts/AppleSDGothicNeo.ttc"),
22
+ Path("/Library/Fonts/NanumGothic.ttf"),
23
+ Path("/usr/share/fonts/truetype/nanum/NanumGothic.ttf"),
24
+ Path("C:/Windows/Fonts/malgun.ttf"),
25
+ ]
26
+ for p in candidates:
27
+ if p.exists():
28
+ return p
29
+ return None
30
+
31
+
32
+ def _make_color_func(color_lookup: dict[str, str], fallback: str) -> callable:
33
+ """키워드별 색상 매핑 함수. 매핑에 없는 단어는 fallback 색상을 사용한다."""
34
+ def color_func(word: str, **kwargs) -> str:
35
+ return color_lookup.get(word, fallback)
36
+ return color_func
37
+
38
+
39
+ def _make_region_color_func(color: str) -> callable:
40
+ """영역 전체를 단일 색상으로 칠하는 함수를 반환한다."""
41
+ return lambda word, **kwargs: color
42
+
43
+
44
def render(
    left: Counter,
    center: Counter,
    right: Counter,
    left_label: str,
    right_label: str,
    word_colors: dict[str, str] | None,
    colormap: str | None,
    left_border_color: str,
    right_border_color: str,
    left_word_color: str,
    right_word_color: str,
    center_word_color: str,
    font_path: Path | None,
    quality_scale: int,
) -> Image.Image:
    """Render three keyword counters as a two-circle Venn word cloud.

    Each counter is drawn as a word cloud masked to one region of a
    two-set Venn diagram: ``left`` in the left-only region, ``right`` in the
    right-only region and ``center`` in the intersection.

    Args:
        left: Keyword frequencies for the left-only region.
        center: Keyword frequencies for the intersection.
        right: Keyword frequencies for the right-only region.
        left_label: Label passed to the Venn diagram for the left set.
        right_label: Label passed to the Venn diagram for the right set.
        word_colors: Optional per-word colours. When truthy it takes
            precedence over ``colormap``; words without an entry use the
            region's fallback colour.
        colormap: Colormap name forwarded to ``WordCloud``; only used when
            ``word_colors`` is empty or ``None``.
        left_border_color: Edge colour of the left region outline.
        right_border_color: Edge colour of the right region outline.
        left_word_color: Fallback word colour for the left region.
        right_word_color: Fallback word colour for the right region.
        center_word_color: Fallback word colour for the intersection.
        font_path: Font file to use. When ``None`` a system font is looked
            up with ``_resolve_font_path``.
        quality_scale: Render quality; clamped to the range 1..3.

    Returns:
        The rendered figure as a PIL image.

    Raises:
        ValueError: If all three counters are empty.
    """
    if font_path is None:
        font_path = _resolve_font_path()

    scale = max(1, min(int(quality_scale), 3))
    border_colors = [left_border_color, right_border_color]
    word_colors_by_region = [left_word_color, right_word_color, center_word_color]

    if not any([left, center, right]):
        raise ValueError("left, center, right 중 하나 이상은 비어있지 않아야 합니다")

    # Scale all three counters by one shared maximum so that word sizes are
    # comparable across regions; ``or 1`` avoids dividing by zero.
    global_max = max(
        max(left.values(), default=0),
        max(center.values(), default=0),
        max(right.values(), default=0),
    ) or 1
    left = Counter({k: v / global_max for k, v in left.items()})
    center = Counter({k: v / global_max for k, v in center.items()})
    right = Counter({k: v / global_max for k, v in right.items()})

    fig_width, fig_height, dpi = 25, 15, 300

    rc_params = {"axes.unicode_minus": False}
    if font_path:
        font_file = str(font_path)
        font_name = fm.FontProperties(fname=font_file).get_name()
        # ``font.family`` is resolved by name through matplotlib's font
        # manager, so a font file the manager has not seen must be registered
        # first; otherwise the labels fall back to the default font.
        if all(entry.name != font_name for entry in fm.fontManager.ttflist):
            fm.fontManager.addfont(font_file)
        rc_params["font.family"] = font_name

    with matplotlib.rc_context(rc_params):
        fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=dpi)
        # Always close the figure, even when drawing fails, so that pyplot
        # does not keep the (large) figure alive in its global registry.
        try:
            # The subset sizes only fix the diagram geometry; they are not
            # derived from the data.
            venn = venn2(subsets=(10, 10, 5), set_labels=(left_label, right_label), ax=ax)

            subset_data = {
                frozenset({0}): left,
                frozenset({1}): right,
                frozenset({0, 1}): center,
            }
            # Same order as ``venn.patches``: left-only, right-only, intersection.
            subset_keys = [frozenset({0}), frozenset({1}), frozenset({0, 1})]

            for idx, patch in enumerate(venn.patches):
                if patch is None:
                    continue
                patch.set_facecolor("none")
                if idx < len(border_colors):
                    patch.set_edgecolor(border_colors[idx])
                    patch.set_linewidth(2)
                else:
                    # The intersection gets no outline of its own.
                    patch.set_linewidth(0)

            # Hide the numeric subset-size labels drawn by venn2.
            for label in venn.subset_labels:
                if label:
                    label.set_visible(False)

            for idx, patch in enumerate(venn.patches):
                if patch is None or idx >= len(subset_keys):
                    continue
                counter = subset_data.get(subset_keys[idx], Counter())
                if not counter:
                    continue

                path = patch.get_path()
                if path is None:
                    continue
                vertices = path.vertices
                if len(vertices) < 3:
                    continue

                polygon = geom.Polygon(vertices)
                min_x, min_y, max_x, max_y = polygon.bounds
                width = max(100, min(800, int((max_x - min_x) * dpi * scale)))
                height = max(100, min(600, int((max_y - min_y) * dpi * scale)))

                # Build the word-cloud mask: 255 (white) is blocked, 0 (black)
                # is the drawable region inside the patch outline.
                img_mask = Image.new("L", (width, height), 255)
                draw = ImageDraw.Draw(img_mask)
                x_range = max_x - min_x if max_x > min_x else 1
                y_range = max_y - min_y if max_y > min_y else 1
                # Map data coordinates to pixels; the y axis is flipped
                # because image rows grow downwards.
                px_vertices = [
                    (int((x - min_x) / x_range * (width - 1)),
                     int((max_y - y) / y_range * (height - 1)))
                    for x, y in vertices
                ]
                draw.polygon(px_vertices, fill=0)
                mask = np.array(img_mask)

                region_word_color = (
                    word_colors_by_region[idx]
                    if idx < len(word_colors_by_region)
                    else "#95a5a6"
                )
                wc_kwargs: dict = {}
                if word_colors:
                    region_lookup = {k: v for k, v in word_colors.items() if k in counter}
                    wc_kwargs["color_func"] = _make_color_func(region_lookup, fallback=region_word_color)
                elif colormap:
                    wc_kwargs["colormap"] = colormap
                else:
                    wc_kwargs["color_func"] = _make_region_color_func(region_word_color)

                wc = WordCloud(
                    font_path=str(font_path) if font_path else None,
                    width=width, height=height, mask=mask,
                    mode="RGBA", background_color=None,
                    prefer_horizontal=0.7, min_font_size=8,
                    max_font_size=int(48 * scale),
                    **wc_kwargs,
                )
                wc.generate_from_frequencies(counter)
                # Overlay the cloud on the patch's bounding box, above the outlines.
                ax.imshow(np.array(wc), extent=[min_x, max_x, min_y, max_y],
                          aspect="auto", alpha=0.9, zorder=10)

            buf = io.BytesIO()
            fig.savefig(buf, format="png", dpi=dpi, bbox_inches="tight",
                        pad_inches=0.1, transparent=False, facecolor="white")
        finally:
            plt.close(fig)

    buf.seek(0)
    # ``copy()`` detaches the image from the in-memory buffer.
    return Image.open(buf).copy()
@@ -0,0 +1,39 @@
1
+ from collections import Counter
2
+
3
+
4
def split_for_comparison(
    a: Counter,
    b: Counter,
    count_a: int,
    count_b: int,
    ratio_threshold: float = 2.0,
) -> tuple[Counter, Counter, Counter]:
    """Split two keyword counters into left / center / right regions.

    Every keyword count is normalised by its side's total (``count_a`` or
    ``count_b``) and then placed according to the ratio of the two
    normalised frequencies:

    - present on one side only: that side, with its normalised frequency
    - ``norm_a / norm_b > ratio_threshold``: left, with ``norm_a``
    - ``norm_b / norm_a > ratio_threshold``: right, with ``norm_b``
    - otherwise (similar frequency): center, with the mean of both

    Keywords whose count is zero on both sides are omitted from the result.

    Args:
        a: Raw keyword counts for the left side.
        b: Raw keyword counts for the right side.
        count_a: Normalisation denominator for ``a``; must be positive.
        count_b: Normalisation denominator for ``b``; must be positive.
        ratio_threshold: Ratio above which a shared keyword is assigned to
            a single side instead of the center.

    Returns:
        A ``(left, center, right)`` tuple of counters holding normalised
        frequencies.

    Raises:
        ValueError: If ``count_a`` or ``count_b`` is not positive.
    """
    if count_a <= 0 or count_b <= 0:
        raise ValueError(
            f"count_a and count_b must be positive, got {count_a!r} and {count_b!r}"
        )

    left: Counter = Counter()
    center: Counter = Counter()
    right: Counter = Counter()

    for k in a.keys() | b.keys():
        norm_a = a.get(k, 0) / count_a
        norm_b = b.get(k, 0) / count_b

        if norm_a == 0 and norm_b == 0:
            # A keyword that never occurs carries no information; do not
            # emit it as a zero-frequency entry.
            continue
        if norm_b == 0:
            left[k] = norm_a
        elif norm_a == 0:
            right[k] = norm_b
        elif norm_a / norm_b > ratio_threshold:
            left[k] = norm_a
        elif norm_b / norm_a > ratio_threshold:
            right[k] = norm_b
        else:
            center[k] = (norm_a + norm_b) / 2

    return left, center, right