pystylometry 1.3.1__py3-none-any.whl → 1.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +42 -3
- pystylometry/_types.py +53 -3
- pystylometry/cli.py +695 -0
- pystylometry/lexical/__init__.py +4 -1
- pystylometry/lexical/bnc_frequency.py +309 -0
- pystylometry/lexical/ttr.py +288 -97
- pystylometry/viz/jsx/__init__.py +2 -0
- pystylometry/viz/jsx/bnc_frequency.py +495 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/METADATA +16 -3
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/RECORD +13 -11
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/entry_points.txt +2 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/LICENSE +0 -0
- {pystylometry-1.3.1.dist-info → pystylometry-1.3.6.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
"""Interactive HTML export for BNC frequency analysis.
|
|
2
|
+
|
|
3
|
+
This module generates a self-contained HTML report showing word frequency
|
|
4
|
+
comparisons against the British National Corpus (BNC).
|
|
5
|
+
|
|
6
|
+
The report has three sections:
|
|
7
|
+
1. Not in BNC - Words not found in the corpus (with WordNet and character type info)
|
|
8
|
+
2. Most Underused - Words appearing less frequently than expected
|
|
9
|
+
3. Most Overused - Words appearing more frequently than expected
|
|
10
|
+
|
|
11
|
+
Related GitHub Issue:
|
|
12
|
+
#TBD - BNC frequency analysis CLI
|
|
13
|
+
https://github.com/craigtrim/pystylometry/issues/TBD
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import TYPE_CHECKING
|
|
20
|
+
|
|
21
|
+
from ._base import CARD_STYLES, generate_html_document, write_html_file
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from pystylometry.lexical.bnc_frequency import BNCFrequencyResult
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _round_or_none(value: float | None, digits: int) -> float | None:
    """Round ``value`` to ``digits`` decimal places, passing ``None`` through.

    An explicit ``is None`` test is used instead of truthiness so that a
    legitimate ``0.0`` is exported as a number rather than as "missing".
    """
    return None if value is None else round(value, digits)


def _not_in_bnc_rows(words) -> list[dict]:
    """Build the JSON-ready rows for the "Not in BNC" table."""
    return [
        {
            "word": w.word,
            "observed": w.observed,
            "inWordnet": w.in_wordnet,
            "charType": w.char_type,
        }
        for w in words
    ]


def _frequency_rows(words, ratio_digits: int) -> list[dict]:
    """Build the JSON-ready rows for the overused / underused tables.

    ``ratio_digits`` is the number of decimals kept for the ratio column.
    """
    return [
        {
            "word": w.word,
            "observed": w.observed,
            "expected": _round_or_none(w.expected, 2),
            "ratio": _round_or_none(w.ratio, ratio_digits),
            "charType": w.char_type,
        }
        for w in words
    ]


def export_bnc_frequency_jsx(
    result: BNCFrequencyResult,
    output_file: str | Path,
    title: str = "BNC Word Frequency Analysis",
    source_file: str | None = None,
) -> Path:
    """Export BNC frequency analysis as interactive HTML.

    Generates a self-contained HTML file with three sections:
    - Not in BNC: Complete table with WordNet status and character type
    - Most Underused: Words below the underuse threshold
    - Most Overused: Words above the overuse threshold

    The data is packed into a ``config`` dict (read by the React component as
    ``CONFIG``), rendered through ``generate_html_document`` and written with
    ``write_html_file``.

    Args:
        result: BNCFrequencyResult from compute_bnc_frequency()
        output_file: Path to write the HTML file
        title: Page title (default: "BNC Word Frequency Analysis")
        source_file: Optional source filename to display

    Returns:
        Path to the written HTML file (the return value of ``write_html_file``)

    Example:
        >>> from pystylometry.lexical.bnc_frequency import compute_bnc_frequency
        >>> from pystylometry.viz.jsx import export_bnc_frequency_jsx
        >>> result = compute_bnc_frequency(text)
        >>> export_bnc_frequency_jsx(result, "frequency_report.html")
    """
    # Build data for the React component.
    # Underused ratios are small fractions, so they keep 4 decimals; overused
    # ratios are displayed as "N.Nx" multipliers, so 1 decimal is enough.
    not_in_bnc_data = _not_in_bnc_rows(result.not_in_bnc)
    underused_data = _frequency_rows(result.underused, ratio_digits=4)
    overused_data = _frequency_rows(result.overused, ratio_digits=1)

    config = {
        "title": title,
        "sourceFile": source_file,
        "notInBnc": not_in_bnc_data,
        "underused": underused_data,
        "overused": overused_data,
        "stats": {
            "totalTokens": result.total_tokens,
            "uniqueTokens": result.unique_tokens,
            "notInBncCount": len(result.not_in_bnc),
            "underusedCount": len(result.underused),
            "overusedCount": len(result.overused),
            "overuseThreshold": result.overuse_threshold,
            "underuseThreshold": result.underuse_threshold,
        },
    }

    # JSX source for the report; it reads its data from the CONFIG global.
    react_component = """
// Color mapping for character types
const CHAR_TYPE_COLORS = {
  latin: { bg: '#dcfce7', text: '#166534', label: 'Latin' },
  unicode: { bg: '#fef3c7', text: '#92400e', label: 'Unicode' },
  numeric: { bg: '#dbeafe', text: '#1e40af', label: 'Numeric' },
  mixed: { bg: '#f3e8ff', text: '#6b21a8', label: 'Mixed' },
  punctuation: { bg: '#f1f5f9', text: '#475569', label: 'Punct' },
};

// Tab configuration
const TABS = [
  { id: 'overused', label: 'Most Overused', color: '#ef4444' },
  { id: 'underused', label: 'Most Underused', color: '#3b82f6' },
  { id: 'notInBnc', label: 'Not in BNC', color: '#6b7280' },
];

// WordNet status badge
function WordnetBadge({ inWordnet }) {
  if (inWordnet === null || inWordnet === undefined) {
    return <span style={{ color: '#9ca3af', fontSize: '12px' }}>—</span>;
  }
  return inWordnet ? (
    <span style={{
      background: '#dcfce7',
      color: '#166534',
      padding: '2px 8px',
      borderRadius: '9999px',
      fontSize: '11px',
      fontWeight: 500,
    }}>Yes</span>
  ) : (
    <span style={{
      background: '#fee2e2',
      color: '#991b1b',
      padding: '2px 8px',
      borderRadius: '9999px',
      fontSize: '11px',
      fontWeight: 500,
    }}>No</span>
  );
}

// Character type badge
function CharTypeBadge({ charType }) {
  const config = CHAR_TYPE_COLORS[charType] || CHAR_TYPE_COLORS.mixed;
  return (
    <span style={{
      background: config.bg,
      color: config.text,
      padding: '2px 8px',
      borderRadius: '9999px',
      fontSize: '11px',
      fontWeight: 500,
    }}>{config.label}</span>
  );
}

// Ratio display with color intensity
function RatioDisplay({ ratio, isOverused }) {
  if (ratio === null || ratio === undefined) return '—';

  let color, intensity;
  if (isOverused) {
    intensity = Math.min(Math.log2(ratio) / 6, 1);
    const r = 239;
    const g = Math.round(68 + (1 - intensity) * 120);
    color = `rgb(${r}, ${g}, 68)`;
  } else {
    intensity = Math.min(Math.abs(Math.log2(ratio)) / 4, 1);
    const b = 246;
    const g = Math.round(130 + (1 - intensity) * 60);
    color = `rgb(59, ${g}, ${b})`;
  }

  const displayValue = isOverused ? ratio.toFixed(1) + 'x' : ratio.toFixed(4);

  return (
    <span style={{
      color: color,
      fontWeight: 600,
      fontFamily: 'ui-monospace, monospace',
    }}>{displayValue}</span>
  );
}

// Stats summary card
function StatsCard({ stats, activeTab, onTabChange }) {
  return (
    <div className="card" style={{ marginBottom: '24px' }}>
      <div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(120px, 1fr))', gap: '16px' }}>
        <div>
          <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Total Tokens</div>
          <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.totalTokens.toLocaleString()}</div>
        </div>
        <div>
          <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>Unique Words</div>
          <div style={{ fontSize: '20px', fontWeight: 600 }}>{stats.uniqueTokens.toLocaleString()}</div>
        </div>
        {TABS.map(tab => (
          <div
            key={tab.id}
            onClick={() => onTabChange(tab.id)}
            style={{
              cursor: 'pointer',
              padding: '8px',
              margin: '-8px',
              borderRadius: '8px',
              background: activeTab === tab.id ? `${tab.color}10` : 'transparent',
              border: activeTab === tab.id ? `2px solid ${tab.color}` : '2px solid transparent',
              transition: 'all 0.15s',
            }}
          >
            <div style={{ fontSize: '11px', color: '#6b7280', marginBottom: '4px' }}>{tab.label}</div>
            <div style={{ fontSize: '20px', fontWeight: 600, color: tab.color }}>
              {tab.id === 'overused' ? stats.overusedCount.toLocaleString() :
               tab.id === 'underused' ? stats.underusedCount.toLocaleString() :
               stats.notInBncCount.toLocaleString()}
            </div>
          </div>
        ))}
      </div>
    </div>
  );
}

// Tab bar component
function TabBar({ activeTab, onTabChange, stats }) {
  return (
    <div style={{ display: 'flex', gap: '4px', marginBottom: '16px', borderBottom: '2px solid #e2e8f0', paddingBottom: '0' }}>
      {TABS.map(tab => {
        const count = tab.id === 'overused' ? stats.overusedCount :
                      tab.id === 'underused' ? stats.underusedCount :
                      stats.notInBncCount;
        const isActive = activeTab === tab.id;
        return (
          <button
            key={tab.id}
            onClick={() => onTabChange(tab.id)}
            style={{
              padding: '12px 20px',
              border: 'none',
              background: 'transparent',
              cursor: 'pointer',
              fontSize: '14px',
              fontWeight: isActive ? 600 : 500,
              color: isActive ? tab.color : '#6b7280',
              borderBottom: isActive ? `3px solid ${tab.color}` : '3px solid transparent',
              marginBottom: '-2px',
              transition: 'all 0.15s',
            }}
          >
            {tab.label}
            <span style={{
              marginLeft: '8px',
              padding: '2px 8px',
              borderRadius: '9999px',
              fontSize: '12px',
              background: isActive ? `${tab.color}20` : '#f1f5f9',
              color: isActive ? tab.color : '#6b7280',
            }}>{count.toLocaleString()}</span>
          </button>
        );
      })}
    </div>
  );
}

// Data table component
function DataTable({ data, columns, emptyMessage, filter, onFilterChange }) {
  const [sortKey, setSortKey] = React.useState(null);
  const [sortDir, setSortDir] = React.useState('desc');

  const filteredData = React.useMemo(() => {
    if (!filter) return data;
    const lowerFilter = filter.toLowerCase();
    return data.filter(row => row.word.toLowerCase().includes(lowerFilter));
  }, [data, filter]);

  const sortedData = React.useMemo(() => {
    if (!sortKey) return filteredData;
    return [...filteredData].sort((a, b) => {
      let aVal = a[sortKey];
      let bVal = b[sortKey];
      if (aVal === null || aVal === undefined) aVal = sortDir === 'desc' ? -Infinity : Infinity;
      if (bVal === null || bVal === undefined) bVal = sortDir === 'desc' ? -Infinity : Infinity;
      if (typeof aVal === 'string') {
        return sortDir === 'desc' ? bVal.localeCompare(aVal) : aVal.localeCompare(bVal);
      }
      return sortDir === 'desc' ? bVal - aVal : aVal - bVal;
    });
  }, [filteredData, sortKey, sortDir]);

  const handleSort = (key) => {
    if (sortKey === key) {
      setSortDir(sortDir === 'desc' ? 'asc' : 'desc');
    } else {
      setSortKey(key);
      setSortDir('desc');
    }
  };

  return (
    <div>
      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '12px' }}>
        <span style={{ fontSize: '13px', color: '#6b7280' }}>{sortedData.length} words</span>
        <input
          type="text"
          placeholder="Filter words..."
          value={filter}
          onChange={(e) => onFilterChange(e.target.value)}
          style={{
            padding: '8px 12px',
            border: '1px solid #e2e8f0',
            borderRadius: '6px',
            fontSize: '13px',
            width: '200px',
          }}
        />
      </div>

      {sortedData.length === 0 ? (
        <div style={{ padding: '48px', textAlign: 'center', color: '#9ca3af' }}>
          {filter ? 'No matching words' : emptyMessage}
        </div>
      ) : (
        <div style={{ overflowX: 'auto', maxHeight: '600px', overflowY: 'auto' }}>
          <table style={{ width: '100%', borderCollapse: 'collapse', fontSize: '13px' }}>
            <thead style={{ position: 'sticky', top: 0, background: 'white' }}>
              <tr style={{ borderBottom: '2px solid #e2e8f0' }}>
                {columns.map(col => (
                  <th
                    key={col.key}
                    onClick={() => col.sortable !== false && handleSort(col.key)}
                    style={{
                      textAlign: col.align || 'left',
                      padding: '10px 12px',
                      fontWeight: 600,
                      color: '#374151',
                      cursor: col.sortable !== false ? 'pointer' : 'default',
                      userSelect: 'none',
                      whiteSpace: 'nowrap',
                      background: 'white',
                    }}
                  >
                    {col.label}
                    {sortKey === col.key && (
                      <span style={{ marginLeft: '4px' }}>{sortDir === 'desc' ? '↓' : '↑'}</span>
                    )}
                  </th>
                ))}
              </tr>
            </thead>
            <tbody>
              {sortedData.map((row, idx) => (
                <tr key={idx} style={{ borderBottom: '1px solid #f1f5f9' }}>
                  {columns.map(col => (
                    <td key={col.key} style={{ padding: '10px 12px', textAlign: col.align || 'left' }}>
                      {col.render ? col.render(row[col.key], row) : row[col.key]}
                    </td>
                  ))}
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  );
}

// Main component
function BNCFrequencyReport() {
  const { title, sourceFile, notInBnc, underused, overused, stats } = CONFIG;
  const [activeTab, setActiveTab] = React.useState('overused');
  const [filter, setFilter] = React.useState('');

  // Reset filter when tab changes
  const handleTabChange = (tab) => {
    setActiveTab(tab);
    setFilter('');
  };

  // Column definitions
  const notInBncColumns = [
    { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
    { key: 'observed', label: 'Mentions', align: 'right' },
    { key: 'inWordnet', label: 'In WordNet', align: 'center', render: (v) => <WordnetBadge inWordnet={v} />, sortable: false },
    { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
  ];

  const frequencyColumns = (isOverused) => [
    { key: 'word', label: 'Word', render: (v) => <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{v}</code> },
    { key: 'observed', label: 'Observed', align: 'right' },
    { key: 'expected', label: 'Expected', align: 'right', render: (v) => v !== null ? v.toFixed(2) : '—' },
    { key: 'ratio', label: 'Ratio', align: 'right', render: (v) => <RatioDisplay ratio={v} isOverused={isOverused} /> },
    { key: 'charType', label: 'Char Type', align: 'center', render: (v) => <CharTypeBadge charType={v} />, sortable: false },
  ];

  const getTabContent = () => {
    switch (activeTab) {
      case 'overused':
        return (
          <DataTable
            data={overused}
            columns={frequencyColumns(true)}
            emptyMessage="No significantly overused words"
            filter={filter}
            onFilterChange={setFilter}
          />
        );
      case 'underused':
        return (
          <DataTable
            data={underused}
            columns={frequencyColumns(false)}
            emptyMessage="No significantly underused words"
            filter={filter}
            onFilterChange={setFilter}
          />
        );
      case 'notInBnc':
        return (
          <DataTable
            data={notInBnc}
            columns={notInBncColumns}
            emptyMessage="All words found in BNC"
            filter={filter}
            onFilterChange={setFilter}
          />
        );
    }
  };

  return (
    <div>
      <div style={{ marginBottom: '24px' }}>
        <h1 style={{ margin: '0 0 8px', fontSize: '24px', fontWeight: 600 }}>{title}</h1>
        {sourceFile && (
          <div style={{ fontSize: '14px', color: '#6b7280' }}>
            Source: <code style={{ background: '#f1f5f9', padding: '2px 6px', borderRadius: '4px' }}>{sourceFile}</code>
          </div>
        )}
      </div>

      <StatsCard stats={stats} activeTab={activeTab} onTabChange={handleTabChange} />

      <div className="card">
        <TabBar activeTab={activeTab} onTabChange={handleTabChange} stats={stats} />
        {getTabContent()}
      </div>

      <div style={{ marginTop: '24px', padding: '16px', background: '#f8fafc', borderRadius: '8px', fontSize: '12px', color: '#6b7280' }}>
        <strong>About this analysis:</strong> Word frequencies are compared against the British National Corpus (BNC),
        a 100-million word collection of British English. Ratios indicate how much more (or less) frequently
        a word appears in this text compared to typical usage. Words not in BNC may be proper nouns,
        technical terms, neologisms, or OCR errors.
      </div>
    </div>
  );
}
"""

    # Page-specific CSS appended to the shared card styles.
    extra_styles = (
        CARD_STYLES
        + """
code {
  font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
}
table {
  font-variant-numeric: tabular-nums;
}
input:focus {
  outline: 2px solid #3b82f6;
  outline-offset: -1px;
}
tr:hover {
  background: #f8fafc;
}
"""
    )

    html = generate_html_document(
        title=title,
        config=config,
        react_component=react_component,
        component_name="BNCFrequencyReport",
        extra_styles=extra_styles,
    )

    return write_html_file(output_file, html)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pystylometry
|
|
3
|
-
Version: 1.3.1
|
|
3
|
+
Version: 1.3.6
|
|
4
4
|
Summary: Comprehensive Python package for stylometric analysis
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
|
|
7
7
|
Author: Craig Trim
|
|
8
8
|
Author-email: craigtrim@gmail.com
|
|
9
9
|
Requires-Python: >=3.9,<4.0
|
|
10
|
-
Classifier: Development Status ::
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -19,7 +19,20 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
20
20
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
21
21
|
Classifier: Typing :: Typed
|
|
22
|
-
|
|
22
|
+
Provides-Extra: all
|
|
23
|
+
Provides-Extra: excel
|
|
24
|
+
Provides-Extra: lexical
|
|
25
|
+
Provides-Extra: readability
|
|
26
|
+
Provides-Extra: syntactic
|
|
27
|
+
Provides-Extra: viz
|
|
28
|
+
Requires-Dist: bnc-lookup (>=1.3.2) ; extra == "lexical" or extra == "all"
|
|
29
|
+
Requires-Dist: matplotlib (>=3.8.0,<4.0.0) ; extra == "viz" or extra == "all"
|
|
30
|
+
Requires-Dist: openpyxl (>=3.1.0,<4.0.0) ; extra == "lexical" or extra == "excel" or extra == "all"
|
|
31
|
+
Requires-Dist: pronouncing (>=0.2.0,<0.3.0) ; extra == "readability" or extra == "all"
|
|
32
|
+
Requires-Dist: rich (>=13.0,<14.0)
|
|
33
|
+
Requires-Dist: seaborn (>=0.13.0,<0.14.0) ; extra == "viz" or extra == "all"
|
|
34
|
+
Requires-Dist: spacy (>=3.8.0,<4.0.0) ; extra == "readability" or extra == "syntactic" or extra == "all"
|
|
35
|
+
Requires-Dist: wordnet-lookup ; extra == "lexical" or extra == "all"
|
|
23
36
|
Project-URL: Homepage, https://github.com/craigtrim/pystylometry
|
|
24
37
|
Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
|
|
25
38
|
Project-URL: Repository, https://github.com/craigtrim/pystylometry
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
pystylometry/README.md,sha256=WFOtCAF3qtDTgGG3a_jTjNSwVgpQEXI1PKqbVBfyo1M,2366
|
|
2
|
-
pystylometry/__init__.py,sha256=
|
|
2
|
+
pystylometry/__init__.py,sha256=bZ8xk66Mx7gj3K_I6594DoqATIGv1FtLFSJmF6Dz1g4,10462
|
|
3
3
|
pystylometry/_normalize.py,sha256=7tdfgAKg5CI2d4eoDypmFqOVByoxpwgUUZD6vyBH86A,8679
|
|
4
|
-
pystylometry/_types.py,sha256=
|
|
4
|
+
pystylometry/_types.py,sha256=_YCkVyvHulmKkvmjzb73dcCOWJwiJZVhkV7sJcMr4YY,83618
|
|
5
5
|
pystylometry/_utils.py,sha256=CXTx4KDJ_6iiHcc2OXqOYs-izhLf_ZEmJFKdHyd7q34,5282
|
|
6
6
|
pystylometry/authorship/README.md,sha256=zNXCpLj7nczPnYykJnCUw3y-kxfC9mWZmngi3nfw6us,1016
|
|
7
7
|
pystylometry/authorship/__init__.py,sha256=D7m38hWi_62o1ZDSrghLCfob9YsykTht4K37wiVgHfg,1530
|
|
@@ -13,7 +13,7 @@ pystylometry/authorship/zeta.py,sha256=oOi9Y6ZPq15ILLVl6So9O9ERvzig26en6_dpQJWeo
|
|
|
13
13
|
pystylometry/character/README.md,sha256=poQwhbI8MabVD_626CWjEL87IOX5YDGS0ZJTH1hNwEE,607
|
|
14
14
|
pystylometry/character/__init__.py,sha256=CiiKJmZ10UJE8qAecavpOKyw-vGonsOew_mFH34ZOC0,371
|
|
15
15
|
pystylometry/character/character_metrics.py,sha256=OCIGP_ivtwtzcifcxcbmp2R5SIKh2tKyvKcHAv64S8g,14029
|
|
16
|
-
pystylometry/cli.py,sha256=
|
|
16
|
+
pystylometry/cli.py,sha256=NRKuA4oCEJPNPkeSUttZxd0ZVQSn4kh77qOTWfjsgyM,40635
|
|
17
17
|
pystylometry/consistency/README.md,sha256=HG_Rd6WRBnIz3M7J11dVDv1S2ARkMABFYrTn-VV8xRY,1058
|
|
18
18
|
pystylometry/consistency/__init__.py,sha256=l7nzpS7M4yHDBbM2LGAtW0XGT2n7YjSey_1xKf45224,2181
|
|
19
19
|
pystylometry/consistency/_thresholds.py,sha256=5fZwdJ_cnDy0ED7CCYs6V_zP6kIAR1p0h0NYkbZ0HRg,6381
|
|
@@ -24,13 +24,14 @@ pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_E
|
|
|
24
24
|
pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
|
|
25
25
|
pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
|
|
26
26
|
pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
|
|
27
|
-
pystylometry/lexical/__init__.py,sha256=
|
|
27
|
+
pystylometry/lexical/__init__.py,sha256=p5vYmHSr_kUHC2Vpng8ObncLs10cdb6s3P23DPmwzoc,1012
|
|
28
28
|
pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
|
|
29
|
+
pystylometry/lexical/bnc_frequency.py,sha256=m_AEYY4joEwVVbzBYJm9zq2-K7Nix9MLE3l4LHomjig,10580
|
|
29
30
|
pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
|
|
30
31
|
pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
|
|
31
32
|
pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
|
|
32
33
|
pystylometry/lexical/repetition.py,sha256=A9L0oNwfnCepVkWy57kjHV47Pw4M6fZXEl25hBVdq2s,18318
|
|
33
|
-
pystylometry/lexical/ttr.py,sha256=
|
|
34
|
+
pystylometry/lexical/ttr.py,sha256=igS8gnvIv57zvjQPtmIgkB5Wj7jdaKSMRpJ1WvMfKtw,13091
|
|
34
35
|
pystylometry/lexical/word_frequency_sophistication.py,sha256=OHOS0fBvd1Bz8zsJk-pJbWLTgImmBd-aewQnp_kq8BY,38828
|
|
35
36
|
pystylometry/lexical/yule.py,sha256=NXggha8jmQCu4i-qKZpISwyJBqNpuPHyVR86BLDLgio,5192
|
|
36
37
|
pystylometry/ngrams/README.md,sha256=50wyaWcLGbosLzTPR1cXdE_xAVU8jVY7fd3ReEk9KnY,802
|
|
@@ -66,13 +67,14 @@ pystylometry/tokenizer.py,sha256=03FEF4kKp72v-ypbtMg8u0WyVJGk3YJx6Nw3SGzyAnA,181
|
|
|
66
67
|
pystylometry/viz/README.md,sha256=mizuBpUzWgJqjC2u9C-Lu4sVDCcTQOgGsarRSkeWPf4,1031
|
|
67
68
|
pystylometry/viz/__init__.py,sha256=3kHMAcJJi8oPhTqUZIRdyf311cdyPOHWaJIUv-w0V04,2219
|
|
68
69
|
pystylometry/viz/drift.py,sha256=r98gQ4s_IlrEuaouxDMyue3cTjGqj10i4IeKC01IuCo,18956
|
|
69
|
-
pystylometry/viz/jsx/__init__.py,sha256=
|
|
70
|
+
pystylometry/viz/jsx/__init__.py,sha256=_-BFtPtBhQyBiKJWGPndI-m-3SRBk1JsFombYXYc2Fk,1191
|
|
70
71
|
pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY,3708
|
|
72
|
+
pystylometry/viz/jsx/bnc_frequency.py,sha256=U8plmMOXMgLuJPMtL5k5MecFAX-5CdnxSLX3mVAmoLY,18391
|
|
71
73
|
pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
|
|
72
74
|
pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
|
|
73
75
|
pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
|
|
74
|
-
pystylometry-1.3.
|
|
75
|
-
pystylometry-1.3.
|
|
76
|
-
pystylometry-1.3.
|
|
77
|
-
pystylometry-1.3.
|
|
78
|
-
pystylometry-1.3.
|
|
76
|
+
pystylometry-1.3.6.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
|
|
77
|
+
pystylometry-1.3.6.dist-info/METADATA,sha256=No130TQB2VZMHPz6XD_Z37ZmyT0lC2Y785PSaDkeeZc,5545
|
|
78
|
+
pystylometry-1.3.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
79
|
+
pystylometry-1.3.6.dist-info/entry_points.txt,sha256=Gr2keJe638qHrrJpCGZAP3AYduxxIaSCoBH4FwAJt7U,204
|
|
80
|
+
pystylometry-1.3.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|