freql 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/CHANGELOG.md +5 -0
  4. data/Gemfile +11 -0
  5. data/Gemfile.lock +42 -0
  6. data/LICENSE.txt +23 -0
  7. data/README.md +89 -0
  8. data/Rakefile +8 -0
  9. data/freql.gemspec +35 -0
  10. data/lib/freql/bindata.rb +55 -0
  11. data/lib/freql/cb.rb +51 -0
  12. data/lib/freql/counter.rb +73 -0
  13. data/lib/freql/data/_chinese_mapping.msgpack.gz +0 -0
  14. data/lib/freql/data/jieba_zh.txt +38811 -0
  15. data/lib/freql/data/jieba_zh_orig.txt +349046 -0
  16. data/lib/freql/data/large_ar.msgpack.gz +0 -0
  17. data/lib/freql/data/large_bn.msgpack.gz +0 -0
  18. data/lib/freql/data/large_ca.msgpack.gz +0 -0
  19. data/lib/freql/data/large_cs.msgpack.gz +0 -0
  20. data/lib/freql/data/large_de.msgpack.gz +0 -0
  21. data/lib/freql/data/large_en.msgpack.gz +0 -0
  22. data/lib/freql/data/large_es.msgpack.gz +0 -0
  23. data/lib/freql/data/large_fi.msgpack.gz +0 -0
  24. data/lib/freql/data/large_fr.msgpack.gz +0 -0
  25. data/lib/freql/data/large_he.msgpack.gz +0 -0
  26. data/lib/freql/data/large_it.msgpack.gz +0 -0
  27. data/lib/freql/data/large_ja.msgpack.gz +0 -0
  28. data/lib/freql/data/large_mk.msgpack.gz +0 -0
  29. data/lib/freql/data/large_nb.msgpack.gz +0 -0
  30. data/lib/freql/data/large_nl.msgpack.gz +0 -0
  31. data/lib/freql/data/large_pl.msgpack.gz +0 -0
  32. data/lib/freql/data/large_pt.msgpack.gz +0 -0
  33. data/lib/freql/data/large_ru.msgpack.gz +0 -0
  34. data/lib/freql/data/large_sv.msgpack.gz +0 -0
  35. data/lib/freql/data/large_uk.msgpack.gz +0 -0
  36. data/lib/freql/data/large_zh.msgpack.gz +0 -0
  37. data/lib/freql/data/small_ar.msgpack.gz +0 -0
  38. data/lib/freql/data/small_bg.msgpack.gz +0 -0
  39. data/lib/freql/data/small_bn.msgpack.gz +0 -0
  40. data/lib/freql/data/small_ca.msgpack.gz +0 -0
  41. data/lib/freql/data/small_cs.msgpack.gz +0 -0
  42. data/lib/freql/data/small_da.msgpack.gz +0 -0
  43. data/lib/freql/data/small_de.msgpack.gz +0 -0
  44. data/lib/freql/data/small_el.msgpack.gz +0 -0
  45. data/lib/freql/data/small_en.msgpack.gz +0 -0
  46. data/lib/freql/data/small_es.msgpack.gz +0 -0
  47. data/lib/freql/data/small_fa.msgpack.gz +0 -0
  48. data/lib/freql/data/small_fi.msgpack.gz +0 -0
  49. data/lib/freql/data/small_fil.msgpack.gz +0 -0
  50. data/lib/freql/data/small_fr.msgpack.gz +0 -0
  51. data/lib/freql/data/small_he.msgpack.gz +0 -0
  52. data/lib/freql/data/small_hi.msgpack.gz +0 -0
  53. data/lib/freql/data/small_hu.msgpack.gz +0 -0
  54. data/lib/freql/data/small_id.msgpack.gz +0 -0
  55. data/lib/freql/data/small_is.msgpack.gz +0 -0
  56. data/lib/freql/data/small_it.msgpack.gz +0 -0
  57. data/lib/freql/data/small_ja.msgpack.gz +0 -0
  58. data/lib/freql/data/small_ko.msgpack.gz +0 -0
  59. data/lib/freql/data/small_lt.msgpack.gz +0 -0
  60. data/lib/freql/data/small_lv.msgpack.gz +0 -0
  61. data/lib/freql/data/small_mk.msgpack.gz +0 -0
  62. data/lib/freql/data/small_ms.msgpack.gz +0 -0
  63. data/lib/freql/data/small_nb.msgpack.gz +0 -0
  64. data/lib/freql/data/small_nl.msgpack.gz +0 -0
  65. data/lib/freql/data/small_pl.msgpack.gz +0 -0
  66. data/lib/freql/data/small_pt.msgpack.gz +0 -0
  67. data/lib/freql/data/small_ro.msgpack.gz +0 -0
  68. data/lib/freql/data/small_ru.msgpack.gz +0 -0
  69. data/lib/freql/data/small_sh.msgpack.gz +0 -0
  70. data/lib/freql/data/small_sk.msgpack.gz +0 -0
  71. data/lib/freql/data/small_sl.msgpack.gz +0 -0
  72. data/lib/freql/data/small_sv.msgpack.gz +0 -0
  73. data/lib/freql/data/small_ta.msgpack.gz +0 -0
  74. data/lib/freql/data/small_tr.msgpack.gz +0 -0
  75. data/lib/freql/data/small_uk.msgpack.gz +0 -0
  76. data/lib/freql/data/small_ur.msgpack.gz +0 -0
  77. data/lib/freql/data/small_vi.msgpack.gz +0 -0
  78. data/lib/freql/data/small_zh.msgpack.gz +0 -0
  79. data/lib/freql/fpbw.rb +28 -0
  80. data/lib/freql/fpmw.rb +41 -0
  81. data/lib/freql/fq.rb +30 -0
  82. data/lib/freql/rank.rb +39 -0
  83. data/lib/freql/version.rb +5 -0
  84. data/lib/freql/words.rb +44 -0
  85. data/lib/freql/zipf.rb +36 -0
  86. data/lib/freql.rb +13 -0
  87. data/sig/freql.rbs +4 -0
  88. metadata +152 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 74eb8d2a60c57b3a8c329845e419c4b72acb74ca49d4acb251dd8866eecdeea9
4
+ data.tar.gz: 795c8b37f96c0decd55a4f3eeb01a4ab14daa33bd02389646606814977cb354d
5
+ SHA512:
6
+ metadata.gz: 3e6c5abbef36d4b09fbc02d2a9b0bbf33ecf8344e519d2074c01b479bfafdf24ab3bf2da0a75f495f2209f46a5b48d000d6a29e9b585452994ed46f79429ae51
7
+ data.tar.gz: 2db83bc03f6044e5ae181bc6cafd0f3ad19e18586a39f7461aef5aaa1d388eef4378802158978241600c1b7012ad4a90b6a168e6509d59408729c25e9938d377
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2023-01-08
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in freql.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+ gem 'pry', '~> 0.14.1'
10
+
11
+ gem "rspec", "~> 3.0"
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ freql (0.1.0)
5
+ msgpack (~> 1.5, >= 1.5.1)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ coderay (1.1.3)
11
+ diff-lcs (1.5.0)
12
+ method_source (1.0.0)
13
+ msgpack (1.7.1)
14
+ pry (0.14.2)
15
+ coderay (~> 1.1)
16
+ method_source (~> 1.0)
17
+ rake (13.0.6)
18
+ rspec (3.12.0)
19
+ rspec-core (~> 3.12.0)
20
+ rspec-expectations (~> 3.12.0)
21
+ rspec-mocks (~> 3.12.0)
22
+ rspec-core (3.12.0)
23
+ rspec-support (~> 3.12.0)
24
+ rspec-expectations (3.12.2)
25
+ diff-lcs (>= 1.2.0, < 2.0)
26
+ rspec-support (~> 3.12.0)
27
+ rspec-mocks (3.12.2)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.12.0)
30
+ rspec-support (3.12.0)
31
+
32
+ PLATFORMS
33
+ x86_64-linux
34
+
35
+ DEPENDENCIES
36
+ freql!
37
+ pry (~> 0.14.1)
38
+ rake (~> 13.0)
39
+ rspec (~> 3.0)
40
+
41
+ BUNDLED WITH
42
+ 2.4.3
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 opsaaaaa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
23
+ Any data under `lib/freql/data` comes from the wordfreq repository and is therefore subject to its license.
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # Freql
2
+
3
+ *aka. ((word) Frequency Lang/Lib)*
4
+
5
+ A library for handling word/token frequencies.
6
+
7
+
8
+ ## features
9
+ - convert cb and fpmw to zipf and other units.
10
+ - basic lookup for word frequencies in various languages.
11
+ - token counting tool
12
+ - tools for building word/token frequency datasets from custom sources
13
+
14
+
15
+ ## Let's educate you about word frequency units real quick.
16
+
17
+
18
+ | name | description | range | examples |
19
+ | --- | --- | --- | --- |
20
+ | fq | frequency represented as a proportion between 0 and 1. Occurrence count divided by total words/tokens | 0 to 1 | 0.053(the) 0.00000001(trella) |
21
+ | fpmw | frequency per million words. | 1 million to 0 | 53703(the) 0.01(trella) |
22
+ | fpbw | frequency per billion words. | 1 billion to 0 | nah |
23
+ | word rank | Frequency rank relative to all the other words within your corpus. | 1+n | the #1 |
24
+ | zipf scale | It's log10 of frequency per billion words. Named after the American linguist George Kingsley Zipf | 9.0 to 0.0(or less technically) | 7.73(the) to 1.01(trella) |
25
+ | cb | A word frequency expressed on a logarithmic centibel scale. Basically zipf optimized for storage. | 0 to -900(or less) | -127(the) -799(trella) |
26
+
27
+
28
+ | name | Advantages | Disadvantages |
29
+ | --- | --- | --- |
30
+ | fq | simple | ...lots and lots of decimals |
31
+ | fpmw | It's straightforward to calculate and understand | It's not easy for humans to compare. For some words it's less than 1 |
32
+ | fpbw | words aren't going to be less than one. | nobody uses it |
33
+ | zipf scale | Easy for humans to compare. | requires decimals for accuracy |
34
+ | cb | we can safely represent it as a positive integer without sacrificing significant accuracy | less human readable than zipf |
35
+
36
+
37
+ ### Where does cb come from?
38
+
39
+ cb is the word frequency unit used by our initial dataset pulled from the wordfreq program.
40
+ https://github.com/rspeer/wordfreq
41
+
42
+ > 0 cB represents a word that occurs with probability 1, so it is the only
43
+ > word in the data (this of course doesn't happen). -200 cB represents a
44
+ > word that occurs once per 100 tokens, -300 cB represents a word that
45
+ > occurs once per 1000 tokens, and so on.
46
+
47
+ It's very similar to zipf, but with a different scale and 0 point.
48
+ It's always less than 0, so rare values can't cross 0.
49
+ The numbers are larger, so you don't need decimals for reasonable accuracy.
50
+ You can easily save them as positive integers.
51
+
52
+ In the wordfreq program they 'bin' the data to reduce the file size further.
53
+ `array[ bin[ "words", ...], ... ]`
54
+ The index of the bin represents the positive cb frequency value.
55
+ you end up with a lot of leading empty bins, but after that it gets really efficient.
56
+
57
+
58
+ ## Installation
59
+
60
+ Install the gem and add to the application's Gemfile by executing:
61
+
62
+ $ bundle add freql
63
+
64
+ If bundler is not being used to manage dependencies, install the gem by executing:
65
+
66
+ $ gem install freql
67
+
68
+ ## Usage
69
+
70
+ ## Development
71
+
72
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
73
+
74
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
75
+
76
+ ## Contributing
77
+
78
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/freql.
79
+
80
+ ## License
81
+
82
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
83
+
84
+ ## Credits
85
+
86
+
87
+ - I read the code from the python wordfreq program. https://github.com/rspeer/wordfreq
88
+
89
+
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/freql.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/freql/version"
4
+
5
# Gem specification for freql: package metadata, the list of files shipped in
# the gem, and the runtime dependency on msgpack.
Gem::Specification.new do |spec|
  spec.name = "freql"
  spec.version = Freql::VERSION
  spec.authors = ["opsaaaaa"]
  spec.email = ["sean@ferney.org"]

  spec.summary = "A library for handling word/token frequency units."

  spec.description = "Right now all we do is convert fpmw to zipf and other units."
  spec.homepage = "https://github.com/opsaaaaa/freql"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "https://github.com/opsaaaaa/freql/blob/master/CHANGELOG.md"

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists every file tracked by git, relative to the
  # directory we chdir into (the directory holding this gemspec).
  gemspec_name = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      # Compare against the basename: the listed paths are relative, whereas
      # __FILE__ is usually absolute, so a direct `f == __FILE__` never matches
      # and the gemspec itself would be packaged.
      (f == gemspec_name) ||
        f.match?(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency 'msgpack', '~> 1.5', '>= 1.5.1'
end
@@ -0,0 +1,55 @@
1
+ require 'zlib'
2
+ require 'msgpack'
3
+
4
+ module Freql
5
+ module BinData
6
+
7
+ # BinData is a tool for compressing key=>integer pair data
8
+ # into an array where the place of the index stores the integer value.
9
+
10
+ # {'three' => 3} > [[],[],[],['three']]
11
+
12
+ # its use in the [wordfq](https://github.com/rspeer/wordfq) program
13
+ # to compress word frequency data.
14
+
15
+
16
+ LANG_FILE_PATH = "lib/freql/data/%s_%s.msgpack.gz"
17
+
18
+ class << self
19
+
20
+ def pack hash_data, size: nil
21
+ size ||= hash_data.values.max
22
+ bin_data = Array.new(size+1) { [] }
23
+
24
+ hash_data.each do |key, val|
25
+ bin_data[val.to_i] << key
26
+ end
27
+
28
+ return bin_data
29
+ end
30
+
31
+ def unpack bin_data
32
+ hash_data = {}
33
+ bin_data.each.with_index do |group, val|
34
+ group.each do |key|
35
+ hash_data[key] = val
36
+ end
37
+ end
38
+ return hash_data
39
+ end
40
+
41
+ def read_lang lang = :en, size: :small, &block
42
+ Zlib::GzipReader.open(LANG_FILE_PATH % [size,lang]) do |gz|
43
+ # The first item in the language data contains version and format information
44
+ # Im choosing to ignore that information for now.
45
+ # The rest of the data is word frequency bindata
46
+ block.call MessagePack.unpack(gz.read)[1..]
47
+ end
48
+ end
49
+
50
+ def read_and_unpack_lang lang = :en, size: :small, &block
51
+ read_lang(lang, size: size) {|data| block.call( unpack(data) ) }
52
+ end
53
+ end
54
+ end
55
+ end
data/lib/freql/cb.rb ADDED
@@ -0,0 +1,51 @@
1
module Freql
  # Conversions for cB, a word frequency expressed on a logarithmic centibel
  # scale.
  #
  # Practical range is about -127 (the) to -799; the actual range is 0 to -900
  # (or less).
  #
  # cB is the unit used by the dataset from the python wordfreq program.
  # https://github.com/rspeer/wordfreq
  #
  # > 0 cB represents a word that occurs with probability 1, so it is the only
  # > word in the data (this of course doesn't happen). -200 cB represents a
  # > word that occurs once per 100 tokens, -300 cB represents a word that
  # > occurs once per 1000 tokens, and so on.
  #
  # Advantages
  # - Very similar to zipf, but with a different scale and zero point.
  # - Compact to store.
  # - Always at or below 0, so rare values cannot cross 0.
  # - The numbers are larger, so decimals are not needed for reasonable accuracy.
  # - Easy to save as positive integers.
  #
  # Disadvantages
  # - Less human readable.
  #
  # The wordfreq program additionally 'bins' the data to shrink the file:
  #   array[ bin[ "words", ...], ... ]
  # The index of a bin is the positive frequency value. That leaves many
  # leading empty bins, but is very efficient after that.
  #
  # Every cb_to_* method takes the absolute value of its argument, so a cB
  # value may be passed as either a negative or a positive number.
  module CB
    # @param cb [Numeric] frequency in cB (sign is ignored).
    # @return [Float] frequency as a proportion of all tokens.
    def self.cb_to_fq(cb)
      exponent = -cb.abs / 100.00
      10.00 ** exponent
    end

    # @param cb [Numeric] frequency in cB (sign is ignored).
    # @return [Float] frequency per million words.
    def self.cb_to_fpmw(cb)
      cb_to_fq(cb) * 1_000_000
    end

    # @param cb [Numeric] frequency in cB (sign is ignored).
    # @return [Float] frequency per billion words.
    def self.cb_to_fpbw(cb)
      cb_to_fq(cb) * 1_000_000_000
    end

    # @param cb [Numeric] frequency in cB (sign is ignored).
    # @return [Float] frequency on the zipf scale.
    def self.cb_to_zipf(cb)
      shifted = -cb.abs + 900.00
      shifted / 100.00
    end

    # Compute the cB value for a token from raw counts.
    #
    # @param occurances [Numeric] number of times the token was seen.
    # @param total [Numeric] total number of tokens counted.
    # @return [Float] log10 of the proportion, times 100.
    def self.calc_cb(occurances, total)
      proportion = occurances / total.to_f
      Math.log10(proportion) * 100.0
    end
  end
end
51
+
@@ -0,0 +1,73 @@
1
+
2
module Freql
  # Tallies word/token occurrences from various inputs and converts the
  # resulting counts into frequency units.
  class Counter
    # @return [Numeric] number of tokens counted so far.
    attr_reader :total
    # @return [Hash] each token mapped to its occurrence count.
    attr_reader :tokens

    # @param tokens [Hash] existing token => count tallies to continue from.
    # @param total [Integer] number of tokens already represented by +tokens+.
    def initialize(tokens: {}, total: 0)
      @tokens = tokens
      @total = total
    end

    # Count every element of +source+ as one token.
    #
    # @param source [Array] tokens to add.
    # @return [Counter] self, so calls can be chained.
    def add_array(source)
      @total += source.length
      source.each { |token| add_token(token) }
      self
    end

    # Count every run of word characters (/\w+/) in +source+.
    #
    # @param source [String] text to scan.
    # @return [Counter] self.
    def add_words(source)
      add_matches(source, /\w+/)
    end

    # Count a single token.
    #
    # @param token [Object] the token to add.
    # @return [Integer] the updated count for +token+.
    def add_single_token(token)
      @total += 1
      add_token(token)
    end

    # Count every match of +pattern+ in +source+.
    #
    # @param source [String] text to scan.
    # @param pattern [Regexp, String] pattern handed to String#scan.
    # @return [Counter] self.
    def add_matches(source, pattern)
      add_array(source.scan(pattern))
    end

    # Count every overlapping slice of +size+ consecutive items in +source+
    # (e.g. "abc" with size 2 adds "ab" and "bc").
    #
    # @param source [String, Array] sequence to slice; needs #length and #[].
    # @param size [Integer] length of each slice.
    # @return [Counter] self.
    def add_inflated_pairs(source, size = 2)
      last_start = source.length - size
      # An empty range (source shorter than size) yields no slices.
      slices = (0..last_start).map { |start| source[start...start + size] }
      add_array(slices)
    end

    # @return [Hash] each token mapped to its frequency in cB.
    def compute_cb
      # Use a local float rather than overwriting @total, so #total keeps
      # reporting the integer count after a computation.
      grand_total = @total.to_f
      tokens.transform_values { |count| CB.calc_cb(count, grand_total) }
    end

    # @return [Hash] each token mapped to the value returned by
    #   ZipF.calc_zipf (defined elsewhere in the gem).
    def compute_zipf
      grand_total = @total.to_f
      tokens.transform_values { |count| ZipF.calc_zipf(count, grand_total) }
    end

    # Bin the tokens by their rounded, positive cB value.
    #
    # @return [Array<Array>] bins as produced by BinData.pack; +[]+ when no
    #   tokens have been counted.
    def compute_bindata
      # Nothing to bin, and there is no maximum value to size the bins from.
      return [] if tokens.empty?

      BinData.pack(tokens.transform_values { |count| CB.calc_cb(count, @total).abs.round })
    end

    private

    # Increment the tally for +token+ without touching the running total.
    #
    # @return [Integer] the updated count for +token+.
    def add_token(token)
      @tokens[token] = @tokens.fetch(token, 0) + 1
    end
  end
end