freql 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/CHANGELOG.md +5 -0
  4. data/Gemfile +11 -0
  5. data/Gemfile.lock +42 -0
  6. data/LICENSE.txt +23 -0
  7. data/README.md +89 -0
  8. data/Rakefile +8 -0
  9. data/freql.gemspec +35 -0
  10. data/lib/freql/bindata.rb +55 -0
  11. data/lib/freql/cb.rb +51 -0
  12. data/lib/freql/counter.rb +73 -0
  13. data/lib/freql/data/_chinese_mapping.msgpack.gz +0 -0
  14. data/lib/freql/data/jieba_zh.txt +38811 -0
  15. data/lib/freql/data/jieba_zh_orig.txt +349046 -0
  16. data/lib/freql/data/large_ar.msgpack.gz +0 -0
  17. data/lib/freql/data/large_bn.msgpack.gz +0 -0
  18. data/lib/freql/data/large_ca.msgpack.gz +0 -0
  19. data/lib/freql/data/large_cs.msgpack.gz +0 -0
  20. data/lib/freql/data/large_de.msgpack.gz +0 -0
  21. data/lib/freql/data/large_en.msgpack.gz +0 -0
  22. data/lib/freql/data/large_es.msgpack.gz +0 -0
  23. data/lib/freql/data/large_fi.msgpack.gz +0 -0
  24. data/lib/freql/data/large_fr.msgpack.gz +0 -0
  25. data/lib/freql/data/large_he.msgpack.gz +0 -0
  26. data/lib/freql/data/large_it.msgpack.gz +0 -0
  27. data/lib/freql/data/large_ja.msgpack.gz +0 -0
  28. data/lib/freql/data/large_mk.msgpack.gz +0 -0
  29. data/lib/freql/data/large_nb.msgpack.gz +0 -0
  30. data/lib/freql/data/large_nl.msgpack.gz +0 -0
  31. data/lib/freql/data/large_pl.msgpack.gz +0 -0
  32. data/lib/freql/data/large_pt.msgpack.gz +0 -0
  33. data/lib/freql/data/large_ru.msgpack.gz +0 -0
  34. data/lib/freql/data/large_sv.msgpack.gz +0 -0
  35. data/lib/freql/data/large_uk.msgpack.gz +0 -0
  36. data/lib/freql/data/large_zh.msgpack.gz +0 -0
  37. data/lib/freql/data/small_ar.msgpack.gz +0 -0
  38. data/lib/freql/data/small_bg.msgpack.gz +0 -0
  39. data/lib/freql/data/small_bn.msgpack.gz +0 -0
  40. data/lib/freql/data/small_ca.msgpack.gz +0 -0
  41. data/lib/freql/data/small_cs.msgpack.gz +0 -0
  42. data/lib/freql/data/small_da.msgpack.gz +0 -0
  43. data/lib/freql/data/small_de.msgpack.gz +0 -0
  44. data/lib/freql/data/small_el.msgpack.gz +0 -0
  45. data/lib/freql/data/small_en.msgpack.gz +0 -0
  46. data/lib/freql/data/small_es.msgpack.gz +0 -0
  47. data/lib/freql/data/small_fa.msgpack.gz +0 -0
  48. data/lib/freql/data/small_fi.msgpack.gz +0 -0
  49. data/lib/freql/data/small_fil.msgpack.gz +0 -0
  50. data/lib/freql/data/small_fr.msgpack.gz +0 -0
  51. data/lib/freql/data/small_he.msgpack.gz +0 -0
  52. data/lib/freql/data/small_hi.msgpack.gz +0 -0
  53. data/lib/freql/data/small_hu.msgpack.gz +0 -0
  54. data/lib/freql/data/small_id.msgpack.gz +0 -0
  55. data/lib/freql/data/small_is.msgpack.gz +0 -0
  56. data/lib/freql/data/small_it.msgpack.gz +0 -0
  57. data/lib/freql/data/small_ja.msgpack.gz +0 -0
  58. data/lib/freql/data/small_ko.msgpack.gz +0 -0
  59. data/lib/freql/data/small_lt.msgpack.gz +0 -0
  60. data/lib/freql/data/small_lv.msgpack.gz +0 -0
  61. data/lib/freql/data/small_mk.msgpack.gz +0 -0
  62. data/lib/freql/data/small_ms.msgpack.gz +0 -0
  63. data/lib/freql/data/small_nb.msgpack.gz +0 -0
  64. data/lib/freql/data/small_nl.msgpack.gz +0 -0
  65. data/lib/freql/data/small_pl.msgpack.gz +0 -0
  66. data/lib/freql/data/small_pt.msgpack.gz +0 -0
  67. data/lib/freql/data/small_ro.msgpack.gz +0 -0
  68. data/lib/freql/data/small_ru.msgpack.gz +0 -0
  69. data/lib/freql/data/small_sh.msgpack.gz +0 -0
  70. data/lib/freql/data/small_sk.msgpack.gz +0 -0
  71. data/lib/freql/data/small_sl.msgpack.gz +0 -0
  72. data/lib/freql/data/small_sv.msgpack.gz +0 -0
  73. data/lib/freql/data/small_ta.msgpack.gz +0 -0
  74. data/lib/freql/data/small_tr.msgpack.gz +0 -0
  75. data/lib/freql/data/small_uk.msgpack.gz +0 -0
  76. data/lib/freql/data/small_ur.msgpack.gz +0 -0
  77. data/lib/freql/data/small_vi.msgpack.gz +0 -0
  78. data/lib/freql/data/small_zh.msgpack.gz +0 -0
  79. data/lib/freql/fpbw.rb +28 -0
  80. data/lib/freql/fpmw.rb +41 -0
  81. data/lib/freql/fq.rb +30 -0
  82. data/lib/freql/rank.rb +39 -0
  83. data/lib/freql/version.rb +5 -0
  84. data/lib/freql/words.rb +44 -0
  85. data/lib/freql/zipf.rb +36 -0
  86. data/lib/freql.rb +13 -0
  87. data/sig/freql.rbs +4 -0
  88. metadata +152 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 74eb8d2a60c57b3a8c329845e419c4b72acb74ca49d4acb251dd8866eecdeea9
4
+ data.tar.gz: 795c8b37f96c0decd55a4f3eeb01a4ab14daa33bd02389646606814977cb354d
5
+ SHA512:
6
+ metadata.gz: 3e6c5abbef36d4b09fbc02d2a9b0bbf33ecf8344e519d2074c01b479bfafdf24ab3bf2da0a75f495f2209f46a5b48d000d6a29e9b585452994ed46f79429ae51
7
+ data.tar.gz: 2db83bc03f6044e5ae181bc6cafd0f3ad19e18586a39f7461aef5aaa1d388eef4378802158978241600c1b7012ad4a90b6a168e6509d59408729c25e9938d377
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ ## [Unreleased]
2
+
3
+ ## [0.1.0] - 2023-01-08
4
+
5
+ - Initial release
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in freql.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+ gem 'pry', '~> 0.14.1'
10
+
11
+ gem "rspec", "~> 3.0"
data/Gemfile.lock ADDED
@@ -0,0 +1,42 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ freql (0.1.0)
5
+ msgpack (~> 1.5, >= 1.5.1)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ coderay (1.1.3)
11
+ diff-lcs (1.5.0)
12
+ method_source (1.0.0)
13
+ msgpack (1.7.1)
14
+ pry (0.14.2)
15
+ coderay (~> 1.1)
16
+ method_source (~> 1.0)
17
+ rake (13.0.6)
18
+ rspec (3.12.0)
19
+ rspec-core (~> 3.12.0)
20
+ rspec-expectations (~> 3.12.0)
21
+ rspec-mocks (~> 3.12.0)
22
+ rspec-core (3.12.0)
23
+ rspec-support (~> 3.12.0)
24
+ rspec-expectations (3.12.2)
25
+ diff-lcs (>= 1.2.0, < 2.0)
26
+ rspec-support (~> 3.12.0)
27
+ rspec-mocks (3.12.2)
28
+ diff-lcs (>= 1.2.0, < 2.0)
29
+ rspec-support (~> 3.12.0)
30
+ rspec-support (3.12.0)
31
+
32
+ PLATFORMS
33
+ x86_64-linux
34
+
35
+ DEPENDENCIES
36
+ freql!
37
+ pry (~> 0.14.1)
38
+ rake (~> 13.0)
39
+ rspec (~> 3.0)
40
+
41
+ BUNDLED WITH
42
+ 2.4.3
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 opsaaaaa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
22
+
23
+ Any data under `lib/freql/data` comes from the wordfreq repository and is therefore subject to its License
data/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # Freql
2
+
3
+ *aka. ((word) Frequency Lang/Lib)*
4
+
5
+ A library for handling word/token frequencies.
6
+
7
+
8
+ ## features
9
+ - convert cb and fpmw to zipf and other units.
10
+ - basic lookup for word frequencies in various languages.
11
+ - token counting tool
12
+ - tools for building word/token frequency datasets from custom sources
13
+
14
+
15
+ ## Lets educate you about word frequency units real quick.
16
+
17
+
18
+ | name | description | range | examples |
19
+ | --- | --- | --- | --- |
20
+ | fq | frequency represented as a proportion between 0 and 1. Occurrence count divided by total words/tokens | 0 to 1 | 0.053(the) 0.00000001(trella) |
21
+ | fpmw | frequency per million words. | 1 million to 0 | 53703(the) 0.01(trella) |
22
+ | fpbw | frequency per billion words. | 1 billion to 0 | nah |
23
+ | word rank | Frequency rank relative to all the other words within your corpus. | 1+n | the #1 |
24
+ | zipf scale | It's log10 of frequency per billion words. Named after the American linguist George Kingsley Zipf | 9.0 to 0.0(or less technically) | 7.73(the) 1.01(trella) |
25
+ | cb | It's a word frequency in the form of a logarithmic centibel scale. Basically zipf optimized for storage. | 0 to -900(or less) | -127(the) -799(trella) |
26
+
27
+
28
+ | name | Advantages | Disadvantages |
29
+ | --- | --- | --- |
30
+ | fq | simple | ...lots and lots of decimals |
31
+ | fpmw | It's straightforward to calculate and understand | It's not easy for humans to compare. For some words it's less than 1 |
32
+ | fpbw | Words aren't going to be less than one. | nobody uses it |
33
+ | zipf scale | Easy for humans to compare. | requires decimals for accuracy |
34
+ | cb | we can safely represent it as a positive integer without sacrificing significant accuracy | less human readable than zipf |
35
+
36
+
37
+ ### Where does cb come from?
38
+
39
+ cb is the word frequency unit used by our initial dataset pulled from the wordfreq program.
40
+ https://github.com/rspeer/wordfreq
41
+
42
+ > 0 cB represents a word that occurs with probability 1, so it is the only
43
+ > word in the data (this of course doesn't happen). -200 cB represents a
44
+ > word that occurs once per 100 tokens, -300 cB represents a word that
45
+ > occurs once per 1000 tokens, and so on.
46
+
47
+ It's very similar to zipf, but with a different scale and 0 point.
48
+ It's always less than 0, so rare values can't cross 0.
49
+ The numbers are also larger, so you don't need decimals for reasonable accuracy.
50
+ You can easily save them as positive integers.
51
+
52
+ In the wordfreq program they 'bin' the data to reduce the file size further.
53
+ `array[ bin[ "words", ...], ... ]`
54
+ The index of the bin represents the positive cb frequency value.
55
+ you end up with a lot of leading empty bins, but after that it gets really efficient.
56
+
57
+
58
+ ## Installation
59
+
60
+ Install the gem and add to the application's Gemfile by executing:
61
+
62
+ $ bundle add freql
63
+
64
+ If bundler is not being used to manage dependencies, install the gem by executing:
65
+
66
+ $ gem install freql
67
+
68
+ ## Usage
69
+
70
+ ## Development
71
+
72
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
73
+
74
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
75
+
76
+ ## Contributing
77
+
78
+ Bug reports and pull requests are welcome on GitHub at https://github.com/opsaaaaa/freql.
79
+
80
+ ## License
81
+
82
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
83
+
84
+ ## Credits
85
+
86
+
87
+ - I read the code from the python wordfreq program. https://github.com/rspeer/wordfreq
88
+
89
+
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/freql.gemspec ADDED
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/freql/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "freql"
7
+ spec.version = Freql::VERSION
8
+ spec.authors = ["opsaaaaa"]
9
+ spec.email = ["sean@ferney.org"]
10
+
11
+ spec.summary = "A library for handling word/token freqencies units."
12
+
13
+ spec.description = "Right now all we do is convert fpmw to zipf and other units."
14
+ spec.homepage = "https://github.com/opsaaaaa/freql"
15
+ spec.license = "MIT"
16
+ spec.required_ruby_version = ">= 2.6.0"
17
+
18
+ spec.metadata["homepage_uri"] = spec.homepage
19
+ spec.metadata["source_code_uri"] = spec.homepage
20
+ spec.metadata["changelog_uri"] = "https://github.com/opsaaaaa/freql/blob/master/CHANGELOG.md"
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(__dir__) do
25
+ `git ls-files -z`.split("\x0").reject do |f|
26
+ (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
27
+ end
28
+ end
29
+ spec.bindir = "exe"
30
+ spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
31
+ spec.require_paths = ["lib"]
32
+
33
+ spec.add_runtime_dependency 'msgpack', '~> 1.5', '>= 1.5.1'
34
+
35
+ end
@@ -0,0 +1,55 @@
1
+ require 'zlib'
2
+ require 'msgpack'
3
+
4
+ module Freql
5
+ module BinData
6
+
7
+ # BinData is a tool for compressing key=>integer pair data
8
+ # into an array where the place of the index stores the integer value.
9
+
10
+ # {'three' => 3} > [[],[],[],['three']]
11
+
12
+ # It is used in the [wordfreq](https://github.com/rspeer/wordfreq) program
13
+ # to compress word frequency data.
14
+
15
+
16
+ LANG_FILE_PATH = "lib/freql/data/%s_%s.msgpack.gz"
17
+
18
+ class << self
19
+
20
+ def pack hash_data, size: nil
21
+ size ||= hash_data.values.max
22
+ bin_data = Array.new(size+1) { [] }
23
+
24
+ hash_data.each do |key, val|
25
+ bin_data[val.to_i] << key
26
+ end
27
+
28
+ return bin_data
29
+ end
30
+
31
+ def unpack bin_data
32
+ hash_data = {}
33
+ bin_data.each.with_index do |group, val|
34
+ group.each do |key|
35
+ hash_data[key] = val
36
+ end
37
+ end
38
+ return hash_data
39
+ end
40
+
41
+ def read_lang lang = :en, size: :small, &block
42
+ Zlib::GzipReader.open(LANG_FILE_PATH % [size,lang]) do |gz|
43
+ # The first item in the language data contains version and format information
44
+ # Im choosing to ignore that information for now.
45
+ # The rest of the data is word frequency bindata
46
+ block.call MessagePack.unpack(gz.read)[1..]
47
+ end
48
+ end
49
+
50
+ def read_and_unpack_lang lang = :en, size: :small, &block
51
+ read_lang(lang, size: size) {|data| block.call( unpack(data) ) }
52
+ end
53
+ end
54
+ end
55
+ end
data/lib/freql/cb.rb ADDED
@@ -0,0 +1,51 @@
1
+ module Freql
2
+ module CB
3
+ # CB is a word frequency in the form of a logarithmic centibel scale.
4
+
5
+ # practical range -127(the) to -799
6
+ # actual range is 0 to -900(or less)
7
+
8
+ # cb is the word frequency unit used by the dataset from the python wordfreq program.
9
+ # https://github.com/rspeer/wordfreq
10
+
11
+ # > 0 cB represents a word that occurs with probability 1, so it is the only
12
+ # > word in the data (this of course doesn't happen). -200 cB represents a
13
+ # > word that occurs once per 100 tokens, -300 cB represents a word that
14
+ # > occurs once per 1000 tokens, and so on.
15
+
16
+ # Advantages
17
+ # - Its very similar to zipf, but with a different scale and 0 point.
18
+ # - Its really good for storage sizes.
19
+ # - Its always less than 0, so rare values cant cross 0.
20
+ # - The numbers are larger, so you don't need decimals for reasonable accuracy.
21
+ # - You can easily save them as positive integers.
22
+
23
+ # Disadvantages
24
+ # - its less human readable.
25
+
26
+ # In the wordfreq program they 'bin' the data to reduce the file size further.
27
+ # array[ bin[ "words", ...], ... ]
28
+ # The index of the bin represents the positive frequency value.
29
+ # You end up with a lot of leading empty bins, but after that it gets really efficient.
30
+
31
+ class << self
32
+ def cb_to_fq cb
33
+ 10.00 ** (-cb.abs / 100.00)
34
+ end
35
+ def cb_to_fpmw cb
36
+ (10.00 ** (-cb.abs / 100.00)) * 1000000
37
+ end
38
+ def cb_to_fpbw cb
39
+ (10.00 ** (-cb.abs / 100.00)) * 1000000000
40
+ end
41
+ def cb_to_zipf cb
42
+ (-cb.abs + 900.00) / 100.00
43
+ end
44
+
45
+ def calc_cb occurances, total
46
+ Math.log10(occurances / total.to_f) * 100.0
47
+ end
48
+ end
49
+ end
50
+ end
51
+
@@ -0,0 +1,73 @@
1
+
2
+ module Freql
3
+ class Counter
4
+
5
+ # Calculate word/token frequencies from various inputs provided.
6
+
7
+
8
+ attr :total
9
+ attr :tokens
10
+
11
+ def initialize tokens: {}, total: 0
12
+ @tokens = tokens
13
+ @total = total
14
+ end
15
+
16
+ def add_array source
17
+ @total += source.length
18
+ source.each do |token|
19
+ add_token(token)
20
+ end
21
+ self
22
+ end
23
+
24
+ def add_words source
25
+ add_matches(source, /\w+/)
26
+ end
27
+
28
+ def add_single_token token
29
+ @total += 1
30
+ add_token(token)
31
+ end
32
+
33
+ def add_matches source, pattern
34
+ add_array(source.scan(pattern))
35
+ end
36
+
37
+ def add_inflated_pairs source, size = 2
38
+ out = []
39
+ for x in 0..(source.length-size) do
40
+ out << source[x...x+size]
41
+ end
42
+ add_array(out)
43
+ end
44
+
45
+ def compute_cb
46
+ @total = @total.to_f
47
+ tokens.transform_values {|count| CB.calc_cb(count,@total)}
48
+ end
49
+
50
+ def compute_zipf
51
+ @total = @total.to_f
52
+ tokens.transform_values {|count| ZipF.calc_zipf(count,@total)}
53
+ end
54
+
55
+ def compute_bindata
56
+ BinData.pack(tokens.transform_values {|count| CB.calc_cb(count,@total).abs.round})
57
+ end
58
+
59
+
60
+ private
61
+
62
+ def add_token token
63
+ if @tokens.has_key?(token)
64
+ @tokens[token] += 1
65
+ else
66
+ @tokens[token] = 1
67
+ end
68
+ end
69
+
70
+
71
+
72
+ end
73
+ end