google-ngrams 0.1.0__py2.py3-none-any.whl → 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google_ngrams/vnc.py +38 -79
- {google_ngrams-0.1.0.dist-info → google_ngrams-0.1.1.dist-info}/METADATA +3 -2
- {google_ngrams-0.1.0.dist-info → google_ngrams-0.1.1.dist-info}/RECORD +6 -6
- {google_ngrams-0.1.0.dist-info → google_ngrams-0.1.1.dist-info}/LICENSE +0 -0
- {google_ngrams-0.1.0.dist-info → google_ngrams-0.1.1.dist-info}/WHEEL +0 -0
- {google_ngrams-0.1.0.dist-info → google_ngrams-0.1.1.dist-info}/top_level.txt +0 -0
google_ngrams/vnc.py
CHANGED
@@ -5,6 +5,7 @@ import matplotlib.pyplot as plt
|
|
5
5
|
from textwrap import dedent
|
6
6
|
from matplotlib.figure import Figure
|
7
7
|
from scipy.cluster import hierarchy as sch
|
8
|
+
from statsmodels.gam.api import GLMGam, BSplines
|
8
9
|
|
9
10
|
|
10
11
|
def _linkage_matrix(time_series,
|
@@ -586,53 +587,6 @@ def _vnc_calculate_info(Z: np.ndarray,
|
|
586
587
|
return R
|
587
588
|
|
588
589
|
|
589
|
-
def _lowess(x,
|
590
|
-
y,
|
591
|
-
f=1./3.):
|
592
|
-
"""
|
593
|
-
Basic LOWESS smoother with uncertainty.
|
594
|
-
Note:
|
595
|
-
- Not robust (so no iteration) and
|
596
|
-
only normally distributed errors.
|
597
|
-
- No higher order polynomials d=1
|
598
|
-
so linear smoother.
|
599
|
-
"""
|
600
|
-
# get some paras
|
601
|
-
# effective width after reduction factor
|
602
|
-
xwidth = f*(x.max()-x.min())
|
603
|
-
# number of obs
|
604
|
-
N = len(x)
|
605
|
-
# Don't assume the data is sorted
|
606
|
-
order = np.argsort(x)
|
607
|
-
# storage
|
608
|
-
y_sm = np.zeros_like(y)
|
609
|
-
y_stderr = np.zeros_like(y)
|
610
|
-
# define the weighting function -- clipping too!
|
611
|
-
tricube = lambda d: np.clip((1 - np.abs(d)**3)**3, 0, 1) # noqa: E731
|
612
|
-
# run the regression for each observation i
|
613
|
-
for i in range(N):
|
614
|
-
dist = np.abs((x[order][i]-x[order]))/xwidth
|
615
|
-
w = tricube(dist)
|
616
|
-
# form linear system with the weights
|
617
|
-
A = np.stack([w, x[order]*w]).T
|
618
|
-
b = w * y[order]
|
619
|
-
ATA = A.T.dot(A)
|
620
|
-
ATb = A.T.dot(b)
|
621
|
-
# solve the system
|
622
|
-
sol = np.linalg.solve(ATA, ATb)
|
623
|
-
# predict for the observation only
|
624
|
-
# equiv of A.dot(yest) just for k
|
625
|
-
yest = A[i].dot(sol)
|
626
|
-
place = order[i]
|
627
|
-
y_sm[place] = yest
|
628
|
-
sigma2 = (np.sum((A.dot(sol) - y[order])**2)/N)
|
629
|
-
# Calculate the standard error
|
630
|
-
y_stderr[place] = np.sqrt(sigma2 *
|
631
|
-
A[i].dot(np.linalg.inv(ATA)
|
632
|
-
).dot(A[i]))
|
633
|
-
return y_sm, y_stderr
|
634
|
-
|
635
|
-
|
636
590
|
class TimeSeries:
|
637
591
|
|
638
592
|
def __init__(self,
|
@@ -733,6 +687,8 @@ class TimeSeries:
|
|
733
687
|
ax.bar(xx, yy, color=fill_color, edgecolor='black',
|
734
688
|
linewidth=.5, width=barwidth)
|
735
689
|
|
690
|
+
ax.set_ylabel('Frequency (per mil. words)')
|
691
|
+
|
736
692
|
# Despine
|
737
693
|
ax.spines['right'].set_visible(False)
|
738
694
|
ax.spines['top'].set_visible(False)
|
@@ -751,7 +707,8 @@ class TimeSeries:
|
|
751
707
|
dpi=150,
|
752
708
|
point_color='black',
|
753
709
|
point_size=0.5,
|
754
|
-
|
710
|
+
smoothing=7,
|
711
|
+
confidence_interval=True) -> Figure:
|
755
712
|
"""
|
756
713
|
Generate a scatter plot of token frequencies over time
|
757
714
|
with a smoothed fit line and a confidence interval.
|
@@ -768,9 +725,10 @@ class TimeSeries:
|
|
768
725
|
The color of the points.
|
769
726
|
point_size:
|
770
727
|
The size of the points.
|
771
|
-
|
772
|
-
|
773
|
-
|
728
|
+
smoothing:
|
729
|
+
A value between 1 and 9 specifying magnitude of smoothing.
|
730
|
+
confidence_interval:
|
731
|
+
Whether to plot a confidence interval.
|
774
732
|
|
775
733
|
Returns
|
776
734
|
-------
|
@@ -778,43 +736,44 @@ class TimeSeries:
|
|
778
736
|
A matplotlib figure.
|
779
737
|
|
780
738
|
"""
|
781
|
-
|
782
|
-
|
783
|
-
|
739
|
+
if 0 < smoothing and smoothing < 10:
|
740
|
+
smoothing = smoothing
|
741
|
+
else:
|
742
|
+
smoothing = 7
|
743
|
+
|
744
|
+
smothing_value = (10 - smoothing)*10
|
784
745
|
|
785
746
|
xx = self.time_intervals
|
786
747
|
yy = self.frequencies
|
787
748
|
|
788
|
-
|
749
|
+
bs = BSplines(xx, df=smothing_value, degree=3)
|
750
|
+
gam_bs = GLMGam.from_formula('y ~ x',
|
751
|
+
data={'y': yy, 'x': xx},
|
752
|
+
smoother=bs)
|
753
|
+
res_bs = gam_bs.fit()
|
754
|
+
|
755
|
+
# get the fit from the glm
|
756
|
+
fit_line = res_bs.predict()
|
757
|
+
fit_line[fit_line < 0] = 0
|
758
|
+
|
759
|
+
# calculate the upper and lower ce
|
760
|
+
upper = res_bs.predict() + res_bs.partial_values(smooth_index=0)[1]
|
761
|
+
lower = res_bs.predict() - res_bs.partial_values(smooth_index=0)[1]
|
789
762
|
|
790
763
|
fig, ax = plt.subplots(figsize=(width, height), dpi=dpi)
|
791
764
|
|
792
|
-
#
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
label='95 uncertainty')
|
802
|
-
if ci == 'strict':
|
803
|
-
ax.fill_between(
|
804
|
-
xx[order], y_sm[order] - y_std[order],
|
805
|
-
y_sm[order] + y_std[order], alpha=0.3,
|
806
|
-
label='97.5 uncertainty')
|
807
|
-
if ci == 'both':
|
808
|
-
ax.fill_between(
|
809
|
-
xx[order], y_sm[order] - 1.96*y_std[order],
|
810
|
-
y_sm[order] + 1.96*y_std[order], alpha=0.3,
|
811
|
-
label='95 uncertainty')
|
812
|
-
ax.fill_between(
|
813
|
-
xx[order], y_sm[order] - y_std[order],
|
814
|
-
y_sm[order] + y_std[order], alpha=0.3,
|
815
|
-
label='97.5 uncertainty')
|
765
|
+
# plot fit line
|
766
|
+
ax.plot(xx, fit_line, color='tomato', linewidth=.5)
|
767
|
+
|
768
|
+
# add confidence interval
|
769
|
+
if confidence_interval is True:
|
770
|
+
ax.fill_between(xx,
|
771
|
+
lower,
|
772
|
+
upper,
|
773
|
+
color='grey', alpha=0.2)
|
816
774
|
|
817
775
|
ax.scatter(xx, yy, s=point_size, color=point_color, alpha=0.75)
|
776
|
+
ax.set_ylabel('Frequency (per mil. words)')
|
818
777
|
|
819
778
|
# Despine
|
820
779
|
ax.spines['right'].set_visible(False)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: google_ngrams
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.1
|
4
4
|
Summary: Fetch and analyze Google Ngram data for specified word forms.
|
5
5
|
Author-email: David Brown <dwb2@andrew.cmu.edu>
|
6
6
|
Maintainer-email: David Brown <dwb2@andrew.cmu.edu>
|
@@ -20,6 +20,7 @@ Requires-Dist: importlib-resources>=6.5
|
|
20
20
|
Requires-Dist: matplotlib>=3.5
|
21
21
|
Requires-Dist: polars>=1.17
|
22
22
|
Requires-Dist: scipy>=1.15
|
23
|
+
Requires-Dist: statsmodels>=0.14
|
23
24
|
|
24
25
|
|
25
26
|
google_ngrams: Fetch and analyze Google Ngram data for specified word forms.
|
@@ -51,7 +52,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o
|
|
51
52
|
|
52
53
|
.. code-block:: install-google_ngrams
|
53
54
|
|
54
|
-
pip install
|
55
|
+
pip install google-ngrams
|
55
56
|
|
56
57
|
|
57
58
|
Usage
|
@@ -1,12 +1,12 @@
|
|
1
1
|
google_ngrams/__init__.py,sha256=yU9IsKP2RkGnWTN_nLchyItQX944LaI3_9k4jC9E3Zw,243
|
2
2
|
google_ngrams/ngrams.py,sha256=qD97qfLQHA61YNACU77EObmUrBom-X7sovckZbF3GK0,6785
|
3
|
-
google_ngrams/vnc.py,sha256=
|
3
|
+
google_ngrams/vnc.py,sha256=zpW35VUgRc-WtJkyHhal1Rm3qD4QGWDSpYuE4ljYZQg,35271
|
4
4
|
google_ngrams/data/__init__.py,sha256=bpfAL5MVH4M27C7KY6sGcmMIb2O7t962ZH_mB6yr8fc,420
|
5
5
|
google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet,sha256=z39hgtEE18o3qAdOc_HpFJOFCPjdgVWDef1rAbZRndQ,12000
|
6
6
|
google_ngrams/data/googlebooks_eng_gb_all_totalcounts_20120701.parquet,sha256=oxcdjSOEFuvt19CiaD0kPapy6Z8lkij0rIDuiJuJUVc,12251
|
7
7
|
google_ngrams/data/googlebooks_eng_us_all_totalcounts_20120701.parquet,sha256=CQi0n-vovKbKYQ7nUWEn7gerGHm8pAusGVpGIx1m9Go,11063
|
8
|
-
google_ngrams-0.1.
|
9
|
-
google_ngrams-0.1.
|
10
|
-
google_ngrams-0.1.
|
11
|
-
google_ngrams-0.1.
|
12
|
-
google_ngrams-0.1.
|
8
|
+
google_ngrams-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
9
|
+
google_ngrams-0.1.1.dist-info/METADATA,sha256=4t2p3L88NxFAJJsQc9uMVPn88J76Gf01p9OTV-DfShs,5573
|
10
|
+
google_ngrams-0.1.1.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
|
11
|
+
google_ngrams-0.1.1.dist-info/top_level.txt,sha256=IjVijaqC11yDFtc5yqqDR_ikNpUJT0zu_6AaDpHs0SA,14
|
12
|
+
google_ngrams-0.1.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|