freql 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/CHANGELOG.md +5 -0
  4. data/Gemfile +11 -0
  5. data/Gemfile.lock +42 -0
  6. data/LICENSE.txt +23 -0
  7. data/README.md +89 -0
  8. data/Rakefile +8 -0
  9. data/freql.gemspec +35 -0
  10. data/lib/freql/bindata.rb +55 -0
  11. data/lib/freql/cb.rb +51 -0
  12. data/lib/freql/counter.rb +73 -0
  13. data/lib/freql/data/_chinese_mapping.msgpack.gz +0 -0
  14. data/lib/freql/data/jieba_zh.txt +38811 -0
  15. data/lib/freql/data/jieba_zh_orig.txt +349046 -0
  16. data/lib/freql/data/large_ar.msgpack.gz +0 -0
  17. data/lib/freql/data/large_bn.msgpack.gz +0 -0
  18. data/lib/freql/data/large_ca.msgpack.gz +0 -0
  19. data/lib/freql/data/large_cs.msgpack.gz +0 -0
  20. data/lib/freql/data/large_de.msgpack.gz +0 -0
  21. data/lib/freql/data/large_en.msgpack.gz +0 -0
  22. data/lib/freql/data/large_es.msgpack.gz +0 -0
  23. data/lib/freql/data/large_fi.msgpack.gz +0 -0
  24. data/lib/freql/data/large_fr.msgpack.gz +0 -0
  25. data/lib/freql/data/large_he.msgpack.gz +0 -0
  26. data/lib/freql/data/large_it.msgpack.gz +0 -0
  27. data/lib/freql/data/large_ja.msgpack.gz +0 -0
  28. data/lib/freql/data/large_mk.msgpack.gz +0 -0
  29. data/lib/freql/data/large_nb.msgpack.gz +0 -0
  30. data/lib/freql/data/large_nl.msgpack.gz +0 -0
  31. data/lib/freql/data/large_pl.msgpack.gz +0 -0
  32. data/lib/freql/data/large_pt.msgpack.gz +0 -0
  33. data/lib/freql/data/large_ru.msgpack.gz +0 -0
  34. data/lib/freql/data/large_sv.msgpack.gz +0 -0
  35. data/lib/freql/data/large_uk.msgpack.gz +0 -0
  36. data/lib/freql/data/large_zh.msgpack.gz +0 -0
  37. data/lib/freql/data/small_ar.msgpack.gz +0 -0
  38. data/lib/freql/data/small_bg.msgpack.gz +0 -0
  39. data/lib/freql/data/small_bn.msgpack.gz +0 -0
  40. data/lib/freql/data/small_ca.msgpack.gz +0 -0
  41. data/lib/freql/data/small_cs.msgpack.gz +0 -0
  42. data/lib/freql/data/small_da.msgpack.gz +0 -0
  43. data/lib/freql/data/small_de.msgpack.gz +0 -0
  44. data/lib/freql/data/small_el.msgpack.gz +0 -0
  45. data/lib/freql/data/small_en.msgpack.gz +0 -0
  46. data/lib/freql/data/small_es.msgpack.gz +0 -0
  47. data/lib/freql/data/small_fa.msgpack.gz +0 -0
  48. data/lib/freql/data/small_fi.msgpack.gz +0 -0
  49. data/lib/freql/data/small_fil.msgpack.gz +0 -0
  50. data/lib/freql/data/small_fr.msgpack.gz +0 -0
  51. data/lib/freql/data/small_he.msgpack.gz +0 -0
  52. data/lib/freql/data/small_hi.msgpack.gz +0 -0
  53. data/lib/freql/data/small_hu.msgpack.gz +0 -0
  54. data/lib/freql/data/small_id.msgpack.gz +0 -0
  55. data/lib/freql/data/small_is.msgpack.gz +0 -0
  56. data/lib/freql/data/small_it.msgpack.gz +0 -0
  57. data/lib/freql/data/small_ja.msgpack.gz +0 -0
  58. data/lib/freql/data/small_ko.msgpack.gz +0 -0
  59. data/lib/freql/data/small_lt.msgpack.gz +0 -0
  60. data/lib/freql/data/small_lv.msgpack.gz +0 -0
  61. data/lib/freql/data/small_mk.msgpack.gz +0 -0
  62. data/lib/freql/data/small_ms.msgpack.gz +0 -0
  63. data/lib/freql/data/small_nb.msgpack.gz +0 -0
  64. data/lib/freql/data/small_nl.msgpack.gz +0 -0
  65. data/lib/freql/data/small_pl.msgpack.gz +0 -0
  66. data/lib/freql/data/small_pt.msgpack.gz +0 -0
  67. data/lib/freql/data/small_ro.msgpack.gz +0 -0
  68. data/lib/freql/data/small_ru.msgpack.gz +0 -0
  69. data/lib/freql/data/small_sh.msgpack.gz +0 -0
  70. data/lib/freql/data/small_sk.msgpack.gz +0 -0
  71. data/lib/freql/data/small_sl.msgpack.gz +0 -0
  72. data/lib/freql/data/small_sv.msgpack.gz +0 -0
  73. data/lib/freql/data/small_ta.msgpack.gz +0 -0
  74. data/lib/freql/data/small_tr.msgpack.gz +0 -0
  75. data/lib/freql/data/small_uk.msgpack.gz +0 -0
  76. data/lib/freql/data/small_ur.msgpack.gz +0 -0
  77. data/lib/freql/data/small_vi.msgpack.gz +0 -0
  78. data/lib/freql/data/small_zh.msgpack.gz +0 -0
  79. data/lib/freql/fpbw.rb +28 -0
  80. data/lib/freql/fpmw.rb +41 -0
  81. data/lib/freql/fq.rb +30 -0
  82. data/lib/freql/rank.rb +39 -0
  83. data/lib/freql/version.rb +5 -0
  84. data/lib/freql/words.rb +44 -0
  85. data/lib/freql/zipf.rb +36 -0
  86. data/lib/freql.rb +13 -0
  87. data/sig/freql.rbs +4 -0
  88. metadata +152 -0
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
data/lib/freql/fpbw.rb ADDED
@@ -0,0 +1,28 @@
module Freql
  # Conversions from FPBW (frequency per billion words) to the other
  # frequency units used by this library.
  #
  # FPBW is the same measure as fpmw (frequency per million words), scaled to
  # a billion words instead of a million. Its advantage over fpmw is that
  # values are far less likely to dip below 1.
  module FPBW
    class << self
      # Converts FPBW to a proportion (fq): the per-billion count divided by
      # one billion.
      def fpbw_to_fq(fpbw)
        fpbw / 1_000_000_000.0
      end

      # Converts FPBW to frequency per million words.
      #
      # Divides by a Float so that an Integer argument is not truncated
      # (1500 per billion is 1.5 per million, not 1).
      def fpbw_to_fpmw(fpbw)
        fpbw / 1000.0
      end

      # Converts FPBW to the Zipf scale, i.e. log10 of the per-billion count.
      def fpbw_to_zipf(fpbw)
        Math.log10(fpbw)
      end

      # Converts FPBW to cb: 100 times log10 of the proportion.
      def fpbw_to_cb(fpbw)
        Math.log10(fpbw / 1_000_000_000.0) * 100.0
      end

      # Computes FPBW from a raw count.
      #
      # occurances - number of times the word/token was seen.
      # total      - total number of words/tokens in the corpus.
      def calc_fpbw(occurances, total)
        (occurances / total.to_f) * 1_000_000_000
      end
    end
  end
end
data/lib/freql/fpmw.rb ADDED
@@ -0,0 +1,41 @@
module Freql
  # FPMW: frequency per million words, i.e. the number of times a word occurs
  # in one million words.
  #
  # Practical range: about 53703 ("the") down to 0.01 ("trella").
  # Theoretical range: 1,000,000 down to 0.
  #
  # An fpmw of 1 means the word occurs once, on average, in every million
  # words. An fpmw of 1,000,000 would mean every word/token in the corpus was
  # the same.
  #
  # Advantages:
  # - It is straightforward to calculate and understand.
  # - Corpus size doesn't change the relative value.
  # - It is an old, established standard.
  #
  # Disadvantages:
  # - Rare words can have an fpmw of less than 1.
  # - The values are not easy for humans to compare.
  module FPMW
    # Converts fpmw to a proportion (fq) by dividing by one million.
    def self.fpmw_to_fq(fpmw)
      fpmw / 1_000_000.0
    end

    # Converts fpmw to frequency per billion words.
    def self.fpmw_to_fpbw(fpmw)
      fpmw * 1000
    end

    # Converts fpmw to the Zipf scale: log10 of the per-billion frequency.
    def self.fpmw_to_zipf(fpmw)
      per_billion = fpmw * 1000
      Math.log10(per_billion)
    end

    # Converts fpmw to cb: 100 times log10 of the proportion.
    def self.fpmw_to_cb(fpmw)
      proportion = fpmw / 1_000_000.0
      Math.log10(proportion) * 100.0
    end

    # Computes fpmw from a raw count.
    #
    # occurances - number of times the word/token was seen.
    # total      - total number of words/tokens in the corpus.
    def self.calc_fpmw(occurances, total)
      occurances / total.to_f * 1_000_000
    end
  end
end
data/lib/freql/fq.rb ADDED
@@ -0,0 +1,30 @@
module Freql
  # FQ: frequency represented as a proportion between 0 and 1, i.e. the
  # occurrences in the corpus divided by the total words in the corpus.
  #
  # Practical range: about 0.053 ("the") down to 0.00000001 ("trella").
  # Theoretical range: 0 to 1.
  module FQ
    # Converts a proportion to frequency per million words.
    def self.fq_to_fpmw(fq)
      fq * 1_000_000
    end

    # Converts a proportion to frequency per billion words.
    def self.fq_to_fpbw(fq)
      fq * 1_000_000_000
    end

    # Converts a proportion to the Zipf scale: log10 of the proportion plus 9.
    def self.fq_to_zipf(fq)
      exponent = Math.log10(fq)
      exponent + 9
    end

    # Converts a proportion to cb: 100 times log10 of the proportion.
    def self.fq_to_cb(fq)
      Math.log10(fq) * 100.0
    end

    # Computes the proportion from a raw count.
    #
    # occurances - number of times the word/token was seen.
    # total      - total number of words/tokens in the corpus.
    def self.calc_fq(occurances, total)
      occurances / total.to_f
    end
  end
end
data/lib/freql/rank.rb ADDED
@@ -0,0 +1,39 @@
1
+ require_relative 'bindata'
2
+ require_relative 'cb'
3
+
module Freql
  # Looks up words/tokens by their frequency rank.
  #
  # Ranks are 1-based: rank 1 is the first entry of the flattened language
  # data held in +ranks+.
  class Rank
    class << self
      # Builds a Rank and loads the data file for +lang+.
      #
      # lang - language identifier handed to BinData.read_lang.
      # size - which data set to load (defaults to :small).
      #
      # Returns the populated Rank instance.
      def by_lang(lang, size: :small)
        rank = new
        rank.extract_lang_file lang, size: size
        rank
      end
    end

    # The flat list of words/tokens, index 0 holding rank 1.
    attr_reader :ranks

    # Returns the word/token at the given 1-based rank, or nil when the rank
    # is out of range.
    #
    # Ranks below 1 are rejected explicitly: without the guard, rank 0 would
    # index -1 and silently return the last entry of the list.
    def lookup(rank)
      return nil if rank < 1

      @ranks[rank - 1]
    end

    # Returns the first +rank+ words/tokens as an Array.
    #
    # A rank below 1 yields an empty Array; without the guard a negative
    # rank would be treated as an end-relative range bound.
    def top_n(rank)
      return [] if rank < 1

      @ranks[0...rank]
    end

    # Reads the language data via BinData.read_lang and stores it as a flat
    # list in @ranks.
    def extract_lang_file(lang, size:)
      BinData.read_lang(lang, size: size) do |lang_data|
        # Some words really should share a ranking because their score is the
        # same. For now that detail is ignored and a simple flat list is built.
        # It is also worth noting that the data contains non-words like 000.
        @ranks = lang_data.flatten
      end
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module Freql
  # Version string of the freql gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,44 @@
1
+ require_relative 'bindata'
2
+ require_relative 'cb'
3
+
module Freql
  # Looks up the stored frequency value for given words/tokens, either as the
  # stored cb value or converted to Zipf.
  #
  # "Words" may not be the best name for this class in the future.
  class Words
    # Builds a Words instance and loads the data file for +lang+.
    #
    # lang - language identifier (defaults to :en).
    # size - which data set to load (defaults to :small).
    def self.by_lang(lang = :en, size: :small)
      new.tap { |instance| instance.extract_lang_file(lang, size: size) }
    end

    # The loaded word => value mapping.
    attr_reader :words

    # Returns the stored value for +word+ (whatever @words[word] yields for a
    # missing key is passed through).
    def lookup(word)
      @words[word]
    end

    # Returns the Zipf value for +word+; a falsy lookup result is returned
    # unchanged without being converted.
    def lookup_zipf(word)
      stored = lookup(word)
      return stored unless stored

      CB.cb_to_zipf(stored)
    end

    # Returns a Hash limited to those of the given words that are present.
    def query(*words)
      @words.slice(*words)
    end

    # Same as #query, with every value converted through CB.cb_to_zipf.
    def query_zipf(*words)
      found = query(*words)
      found.transform_values do |stored|
        CB.cb_to_zipf(stored)
      end
    end

    # Loads the language data via BinData.read_and_unpack_lang into @words.
    def extract_lang_file(lang, size:)
      BinData.read_and_unpack_lang(lang, size: size) do |unpacked|
        @words = unpacked
      end
    end
  end
end
data/lib/freql/zipf.rb ADDED
@@ -0,0 +1,36 @@
module Freql
  # ZipF is log10 of the frequency per billion words.
  # Named after the American linguist George Kingsley Zipf.
  #
  # Practical range: roughly 1 to 7, from about 1.01 for a rare word
  # ("trella") up to about 7.73 for a very common one ("the").
  # Theoretical range: 9.0 down to 0.0 (or, technically, less).
  #
  # Advantages:
  # - It is human readable and a known, common standard.
  #
  # Disadvantages:
  # - It requires decimals for accuracy.
  # - Technically it can drop below 0 for extremely rare items in large
  #   datasets.
  module ZipF
    # Converts a Zipf value to a proportion (fq).
    def self.zipf_to_fq(zipf)
      (10.00**zipf) / 1e9
    end

    # Converts a Zipf value to frequency per million words.
    def self.zipf_to_fpmw(zipf)
      (10.00**zipf) / 1000
    end

    # Converts a Zipf value to frequency per billion words.
    def self.zipf_to_fpbw(zipf)
      10.00**zipf
    end

    # Converts a Zipf value to cb: scale by 100, then shift down by 900.
    def self.zipf_to_cb(zipf)
      scaled = zipf * 100.0
      scaled - 900.00
    end

    # Computes the Zipf value from a raw count.
    #
    # occurances - number of times the word/token was seen.
    # total      - total number of words/tokens in the corpus.
    def self.calc_zipf(occurances, total)
      proportion = occurances / total.to_f
      Math.log10(proportion) + 9
    end
  end
end
data/lib/freql.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "freql/version"
4
+ require_relative "freql/bindata"
5
+ require_relative "freql/counter"
6
+ require_relative "freql/words"
7
+
module Freql
  # Error class of the freql gem; a direct subclass of StandardError.
  class Error < StandardError
  end
end
data/sig/freql.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Freql
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,152 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: freql
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - opsaaaaa
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-06-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: msgpack
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.5.1
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.5'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.5.1
33
+ description: Right now all we do is convert fpmw to zipf and other units.
34
+ email:
35
+ - sean@ferney.org
36
+ executables: []
37
+ extensions: []
38
+ extra_rdoc_files: []
39
+ files:
40
+ - ".rspec"
41
+ - CHANGELOG.md
42
+ - Gemfile
43
+ - Gemfile.lock
44
+ - LICENSE.txt
45
+ - README.md
46
+ - Rakefile
47
+ - freql.gemspec
48
+ - lib/freql.rb
49
+ - lib/freql/bindata.rb
50
+ - lib/freql/cb.rb
51
+ - lib/freql/counter.rb
52
+ - lib/freql/data/_chinese_mapping.msgpack.gz
53
+ - lib/freql/data/jieba_zh.txt
54
+ - lib/freql/data/jieba_zh_orig.txt
55
+ - lib/freql/data/large_ar.msgpack.gz
56
+ - lib/freql/data/large_bn.msgpack.gz
57
+ - lib/freql/data/large_ca.msgpack.gz
58
+ - lib/freql/data/large_cs.msgpack.gz
59
+ - lib/freql/data/large_de.msgpack.gz
60
+ - lib/freql/data/large_en.msgpack.gz
61
+ - lib/freql/data/large_es.msgpack.gz
62
+ - lib/freql/data/large_fi.msgpack.gz
63
+ - lib/freql/data/large_fr.msgpack.gz
64
+ - lib/freql/data/large_he.msgpack.gz
65
+ - lib/freql/data/large_it.msgpack.gz
66
+ - lib/freql/data/large_ja.msgpack.gz
67
+ - lib/freql/data/large_mk.msgpack.gz
68
+ - lib/freql/data/large_nb.msgpack.gz
69
+ - lib/freql/data/large_nl.msgpack.gz
70
+ - lib/freql/data/large_pl.msgpack.gz
71
+ - lib/freql/data/large_pt.msgpack.gz
72
+ - lib/freql/data/large_ru.msgpack.gz
73
+ - lib/freql/data/large_sv.msgpack.gz
74
+ - lib/freql/data/large_uk.msgpack.gz
75
+ - lib/freql/data/large_zh.msgpack.gz
76
+ - lib/freql/data/small_ar.msgpack.gz
77
+ - lib/freql/data/small_bg.msgpack.gz
78
+ - lib/freql/data/small_bn.msgpack.gz
79
+ - lib/freql/data/small_ca.msgpack.gz
80
+ - lib/freql/data/small_cs.msgpack.gz
81
+ - lib/freql/data/small_da.msgpack.gz
82
+ - lib/freql/data/small_de.msgpack.gz
83
+ - lib/freql/data/small_el.msgpack.gz
84
+ - lib/freql/data/small_en.msgpack.gz
85
+ - lib/freql/data/small_es.msgpack.gz
86
+ - lib/freql/data/small_fa.msgpack.gz
87
+ - lib/freql/data/small_fi.msgpack.gz
88
+ - lib/freql/data/small_fil.msgpack.gz
89
+ - lib/freql/data/small_fr.msgpack.gz
90
+ - lib/freql/data/small_he.msgpack.gz
91
+ - lib/freql/data/small_hi.msgpack.gz
92
+ - lib/freql/data/small_hu.msgpack.gz
93
+ - lib/freql/data/small_id.msgpack.gz
94
+ - lib/freql/data/small_is.msgpack.gz
95
+ - lib/freql/data/small_it.msgpack.gz
96
+ - lib/freql/data/small_ja.msgpack.gz
97
+ - lib/freql/data/small_ko.msgpack.gz
98
+ - lib/freql/data/small_lt.msgpack.gz
99
+ - lib/freql/data/small_lv.msgpack.gz
100
+ - lib/freql/data/small_mk.msgpack.gz
101
+ - lib/freql/data/small_ms.msgpack.gz
102
+ - lib/freql/data/small_nb.msgpack.gz
103
+ - lib/freql/data/small_nl.msgpack.gz
104
+ - lib/freql/data/small_pl.msgpack.gz
105
+ - lib/freql/data/small_pt.msgpack.gz
106
+ - lib/freql/data/small_ro.msgpack.gz
107
+ - lib/freql/data/small_ru.msgpack.gz
108
+ - lib/freql/data/small_sh.msgpack.gz
109
+ - lib/freql/data/small_sk.msgpack.gz
110
+ - lib/freql/data/small_sl.msgpack.gz
111
+ - lib/freql/data/small_sv.msgpack.gz
112
+ - lib/freql/data/small_ta.msgpack.gz
113
+ - lib/freql/data/small_tr.msgpack.gz
114
+ - lib/freql/data/small_uk.msgpack.gz
115
+ - lib/freql/data/small_ur.msgpack.gz
116
+ - lib/freql/data/small_vi.msgpack.gz
117
+ - lib/freql/data/small_zh.msgpack.gz
118
+ - lib/freql/fpbw.rb
119
+ - lib/freql/fpmw.rb
120
+ - lib/freql/fq.rb
121
+ - lib/freql/rank.rb
122
+ - lib/freql/version.rb
123
+ - lib/freql/words.rb
124
+ - lib/freql/zipf.rb
125
+ - sig/freql.rbs
126
+ homepage: https://github.com/opsaaaaa/freql
127
+ licenses:
128
+ - MIT
129
+ metadata:
130
+ homepage_uri: https://github.com/opsaaaaa/freql
131
+ source_code_uri: https://github.com/opsaaaaa/freql
132
+ changelog_uri: https://github.com/opsaaaaa/freql/blob/master/CHANGELOG.md
133
+ post_install_message:
134
+ rdoc_options: []
135
+ require_paths:
136
+ - lib
137
+ required_ruby_version: !ruby/object:Gem::Requirement
138
+ requirements:
139
+ - - ">="
140
+ - !ruby/object:Gem::Version
141
+ version: 2.6.0
142
+ required_rubygems_version: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - ">="
145
+ - !ruby/object:Gem::Version
146
+ version: '0'
147
+ requirements: []
148
+ rubygems_version: 3.4.1
149
+ signing_key:
150
+ specification_version: 4
151
+ summary: A library for handling word/token freqencies units.
152
+ test_files: []