google-ngrams 0.1.0__py2.py3-none-any.whl → 0.1.1__py2.py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
google_ngrams/vnc.py CHANGED
@@ -5,6 +5,7 @@ import matplotlib.pyplot as plt
 from textwrap import dedent
 from matplotlib.figure import Figure
 from scipy.cluster import hierarchy as sch
+from statsmodels.gam.api import GLMGam, BSplines
 
 
 def _linkage_matrix(time_series,
@@ -586,53 +587,6 @@ def _vnc_calculate_info(Z: np.ndarray,
     return R
 
 
-def _lowess(x,
-            y,
-            f=1./3.):
-    """
-    Basic LOWESS smoother with uncertainty.
-    Note:
-        - Not robust (so no iteration) and
-          only normally distributed errors.
-        - No higher order polynomials d=1
-          so linear smoother.
-    """
-    # get some paras
-    # effective width after reduction factor
-    xwidth = f*(x.max()-x.min())
-    # number of obs
-    N = len(x)
-    # Don't assume the data is sorted
-    order = np.argsort(x)
-    # storage
-    y_sm = np.zeros_like(y)
-    y_stderr = np.zeros_like(y)
-    # define the weigthing function -- clipping too!
-    tricube = lambda d: np.clip((1 - np.abs(d)**3)**3, 0, 1)  # noqa: E731
-    # run the regression for each observation i
-    for i in range(N):
-        dist = np.abs((x[order][i]-x[order]))/xwidth
-        w = tricube(dist)
-        # form linear system with the weights
-        A = np.stack([w, x[order]*w]).T
-        b = w * y[order]
-        ATA = A.T.dot(A)
-        ATb = A.T.dot(b)
-        # solve the syste
-        sol = np.linalg.solve(ATA, ATb)
-        # predict for the observation only
-        # equiv of A.dot(yest) just for k
-        yest = A[i].dot(sol)
-        place = order[i]
-        y_sm[place] = yest
-        sigma2 = (np.sum((A.dot(sol) - y[order])**2)/N)
-        # Calculate the standard error
-        y_stderr[place] = np.sqrt(sigma2 *
-                                  A[i].dot(np.linalg.inv(ATA)
-                                           ).dot(A[i]))
-    return y_sm, y_stderr
-
-
 class TimeSeries:
 
     def __init__(self,
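For reference, the `_lowess` helper removed above fit a tricube-weighted linear regression separately at every observation. A minimal standalone sketch of that per-point solve is shown below; the data values are made up for illustration and nothing in this snippet is part of the package:

    import numpy as np

    # illustrative years and per-million-word frequencies (made-up values)
    x = np.array([1900., 1910., 1920., 1930., 1940.])
    y = np.array([0.8, 1.1, 0.9, 1.4, 1.6])

    i = 2                                      # smooth at the middle observation
    xwidth = (1. / 3.) * (x.max() - x.min())   # effective window width (f = 1/3)

    # tricube weights, clipped to zero outside the window
    w = np.clip((1 - (np.abs(x - x[i]) / xwidth) ** 3) ** 3, 0, 1)

    # weighted design matrix [w, x*w] and target w*y, solved via the normal equations
    A = np.stack([w, x * w]).T
    b = w * y
    sol = np.linalg.solve(A.T @ A, A.T @ b)

    print(A[i] @ sol)   # smoothed value at x[i]

Repeating this solve for every observation, plus a pointwise standard error computed from the weighted residuals, is all the removed function did; version 0.1.1 replaces it with the spline-based GAM fit shown further down.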
@@ -733,6 +687,8 @@ class TimeSeries:
         ax.bar(xx, yy, color=fill_color, edgecolor='black',
                linewidth=.5, width=barwidth)
 
+        ax.set_ylabel('Frequency (per mil. words)')
+
         # Despine
         ax.spines['right'].set_visible(False)
         ax.spines['top'].set_visible(False)
@@ -751,7 +707,8 @@ class TimeSeries:
                  dpi=150,
                  point_color='black',
                  point_size=0.5,
-                 ci='standard') -> Figure:
+                 smoothing=7,
+                 confidence_interval=True) -> Figure:
        """
        Generate a scatter plot of token frequenices over time
        with a smoothed fit line and a confidence interval.
@@ -768,9 +725,10 @@
            The color of the points.
        point_size:
            The size of the points.
-        ci:
-            The confidence interval. One of "standard" (95%),
-            "strict" (97.5%) or "both".
+        smoothing:
+            A value between 1 and 9 specifying magnitude of smoothing.
+        confidence_interval:
+            Whether to plot a confidence interval.
 
        Returns
        -------
@@ -778,43 +736,44 @@
            A matplotlib figure.
 
        """
-        ci_types = ['standard', 'strict', 'both']
-        if ci not in ci_types:
-            ci = "standard"
+        if 0 < smoothing and smoothing < 10:
+            smoothing = smoothing
+        else:
+            smoothing = 7
+
+        smothing_value = (10 - smoothing)*10
 
        xx = self.time_intervals
        yy = self.frequencies
 
-        order = np.argsort(xx)
+        bs = BSplines(xx, df=smothing_value, degree=3)
+        gam_bs = GLMGam.from_formula('y ~ x',
+                                     data={'y': yy, 'x': xx},
+                                     smoother=bs)
+        res_bs = gam_bs.fit()
+
+        # get the fit from the glm
+        fit_line = res_bs.predict()
+        fit_line[fit_line < 0] = 0
+
+        # calculate the upper and lower ce
+        upper = res_bs.predict() + res_bs.partial_values(smooth_index=0)[1]
+        lower = res_bs.predict() - res_bs.partial_values(smooth_index=0)[1]
 
        fig, ax = plt.subplots(figsize=(width, height), dpi=dpi)
 
-        # run it
-        y_sm, y_std = _lowess(xx, yy, f=1./5.)
-        # plot it
-        ax.plot(xx[order], y_sm[order],
-                color='tomato', linewidth=.5, label='LOWESS')
-        if ci == 'standard':
-            ax.fill_between(
-                xx[order], y_sm[order] - 1.96*y_std[order],
-                y_sm[order] + 1.96*y_std[order], alpha=0.3,
-                label='95 uncertainty')
-        if ci == 'strict':
-            ax.fill_between(
-                xx[order], y_sm[order] - y_std[order],
-                y_sm[order] + y_std[order], alpha=0.3,
-                label='97.5 uncertainty')
-        if ci == 'both':
-            ax.fill_between(
-                xx[order], y_sm[order] - 1.96*y_std[order],
-                y_sm[order] + 1.96*y_std[order], alpha=0.3,
-                label='95 uncertainty')
-            ax.fill_between(
-                xx[order], y_sm[order] - y_std[order],
-                y_sm[order] + y_std[order], alpha=0.3,
-                label='97.5 uncertainty')
+        # plot fit line
+        ax.plot(xx, fit_line, color='tomato', linewidth=.5)
+
+        # add cofidence interval
+        if confidence_interval is True:
+            ax.fill_between(xx,
+                            lower,
+                            upper,
+                            color='grey', alpha=0.2)
 
        ax.scatter(xx, yy, s=point_size, color=point_color, alpha=0.75)
+        ax.set_ylabel('Frequency (per mil. words)')
 
        # Despine
        ax.spines['right'].set_visible(False)
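The replacement smoother is a statsmodels generalized additive model with a B-spline basis, as added above. The following is a rough standalone sketch of the same pattern; the synthetic series and the variable names (x, y, gam, res, fit, se, upper, lower) are illustrative and not taken from the package:

    import numpy as np
    from statsmodels.gam.api import GLMGam, BSplines

    # synthetic time series standing in for ngram frequencies
    x = np.arange(1900, 2000, dtype=float)
    rng = np.random.default_rng(0)
    y = np.sin((x - 1900) / 15.0) + 1.5 + rng.normal(scale=0.1, size=x.size)

    smoothing = 7                  # package default per the diff
    df = (10 - smoothing) * 10     # -> 30 spline degrees of freedom

    bs = BSplines(x, df=df, degree=3)
    gam = GLMGam.from_formula('y ~ x', data={'y': y, 'x': x}, smoother=bs)
    res = gam.fit()

    fit = res.predict()                         # smoothed fit line
    se = res.partial_values(smooth_index=0)[1]  # pointwise std. error of the smooth term
    upper, lower = fit + se, fit - se           # band drawn when confidence_interval=True

Note the design choice encoded in smothing_value = (10 - smoothing)*10: the user-facing smoothing value of 1 to 9 maps to 90 down to 10 spline degrees of freedom, so larger values mean fewer basis functions and a flatter fit.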
google_ngrams-0.1.0.dist-info/METADATA → google_ngrams-0.1.1.dist-info/METADATA RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: google_ngrams
-Version: 0.1.0
+Version: 0.1.1
 Summary: Fetch and analyze Google Ngram data for specified word forms.
 Author-email: David Brown <dwb2@andrew.cmu.edu>
 Maintainer-email: David Brown <dwb2@andrew.cmu.edu>
@@ -20,6 +20,7 @@ Requires-Dist: importlib-resources>=6.5
 Requires-Dist: matplotlib>=3.5
 Requires-Dist: polars>=1.17
 Requires-Dist: scipy>=1.15
+Requires-Dist: statsmodels>=0.14
 
 
 google_ngrams: Fetch and analyze Google Ngram data for specified word forms.
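The new statsmodels requirement above is what provides GLMGam and BSplines for the smoothing change in vnc.py. A quick, purely illustrative way to confirm an environment satisfies it:

    import statsmodels
    print(statsmodels.__version__)   # should report 0.14 or newer, per the new Requires-Dist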
@@ -51,7 +52,7 @@ You can install the released version of google_ngrams from `PyPI <https://pypi.o
 
 .. code-block:: install-google_ngrams
 
-   pip install google_ngrams
+   pip install google-ngrams
 
 
 Usage
google_ngrams-0.1.0.dist-info/RECORD → google_ngrams-0.1.1.dist-info/RECORD RENAMED

@@ -1,12 +1,12 @@
 google_ngrams/__init__.py,sha256=yU9IsKP2RkGnWTN_nLchyItQX944LaI3_9k4jC9E3Zw,243
 google_ngrams/ngrams.py,sha256=qD97qfLQHA61YNACU77EObmUrBom-X7sovckZbF3GK0,6785
-google_ngrams/vnc.py,sha256=ebCzcUhr7qGrxrzOrFN-BG7ZSF5xezb8rbHaZ9uKEbU,36643
+google_ngrams/vnc.py,sha256=zpW35VUgRc-WtJkyHhal1Rm3qD4QGWDSpYuE4ljYZQg,35271
 google_ngrams/data/__init__.py,sha256=bpfAL5MVH4M27C7KY6sGcmMIb2O7t962ZH_mB6yr8fc,420
 google_ngrams/data/googlebooks_eng_all_totalcounts_20120701.parquet,sha256=z39hgtEE18o3qAdOc_HpFJOFCPjdgVWDef1rAbZRndQ,12000
 google_ngrams/data/googlebooks_eng_gb_all_totalcounts_20120701.parquet,sha256=oxcdjSOEFuvt19CiaD0kPapy6Z8lkij0rIDuiJuJUVc,12251
 google_ngrams/data/googlebooks_eng_us_all_totalcounts_20120701.parquet,sha256=CQi0n-vovKbKYQ7nUWEn7gerGHm8pAusGVpGIx1m9Go,11063
-google_ngrams-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-google_ngrams-0.1.0.dist-info/METADATA,sha256=N5OLJiznNmjZPGRLKf99k0a2ehgmnBt8SX_J7r4vfmI,5540
-google_ngrams-0.1.0.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
-google_ngrams-0.1.0.dist-info/top_level.txt,sha256=IjVijaqC11yDFtc5yqqDR_ikNpUJT0zu_6AaDpHs0SA,14
-google_ngrams-0.1.0.dist-info/RECORD,,
+google_ngrams-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+google_ngrams-0.1.1.dist-info/METADATA,sha256=4t2p3L88NxFAJJsQc9uMVPn88J76Gf01p9OTV-DfShs,5573
+google_ngrams-0.1.1.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
+google_ngrams-0.1.1.dist-info/top_level.txt,sha256=IjVijaqC11yDFtc5yqqDR_ikNpUJT0zu_6AaDpHs0SA,14
+google_ngrams-0.1.1.dist-info/RECORD,,