RubyGems - freql - Versions diffs - 0.1.0 - Mend

freql 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/CHANGELOG.md +5 -0
data/Gemfile +11 -0
data/Gemfile.lock +42 -0
data/LICENSE.txt +23 -0
data/README.md +89 -0
data/Rakefile +8 -0
data/freql.gemspec +35 -0
data/lib/freql/bindata.rb +55 -0
data/lib/freql/cb.rb +51 -0
data/lib/freql/counter.rb +73 -0
data/lib/freql/data/_chinese_mapping.msgpack.gz +0 -0
data/lib/freql/data/jieba_zh.txt +38811 -0
data/lib/freql/data/jieba_zh_orig.txt +349046 -0
data/lib/freql/data/large_ar.msgpack.gz +0 -0
data/lib/freql/data/large_bn.msgpack.gz +0 -0
data/lib/freql/data/large_ca.msgpack.gz +0 -0
data/lib/freql/data/large_cs.msgpack.gz +0 -0
data/lib/freql/data/large_de.msgpack.gz +0 -0
data/lib/freql/data/large_en.msgpack.gz +0 -0
data/lib/freql/data/large_es.msgpack.gz +0 -0
data/lib/freql/data/large_fi.msgpack.gz +0 -0
data/lib/freql/data/large_fr.msgpack.gz +0 -0
data/lib/freql/data/large_he.msgpack.gz +0 -0
data/lib/freql/data/large_it.msgpack.gz +0 -0
data/lib/freql/data/large_ja.msgpack.gz +0 -0
data/lib/freql/data/large_mk.msgpack.gz +0 -0
data/lib/freql/data/large_nb.msgpack.gz +0 -0
data/lib/freql/data/large_nl.msgpack.gz +0 -0
data/lib/freql/data/large_pl.msgpack.gz +0 -0
data/lib/freql/data/large_pt.msgpack.gz +0 -0
data/lib/freql/data/large_ru.msgpack.gz +0 -0
data/lib/freql/data/large_sv.msgpack.gz +0 -0
data/lib/freql/data/large_uk.msgpack.gz +0 -0
data/lib/freql/data/large_zh.msgpack.gz +0 -0
data/lib/freql/data/small_ar.msgpack.gz +0 -0
data/lib/freql/data/small_bg.msgpack.gz +0 -0
data/lib/freql/data/small_bn.msgpack.gz +0 -0
data/lib/freql/data/small_ca.msgpack.gz +0 -0
data/lib/freql/data/small_cs.msgpack.gz +0 -0
data/lib/freql/data/small_da.msgpack.gz +0 -0
data/lib/freql/data/small_de.msgpack.gz +0 -0
data/lib/freql/data/small_el.msgpack.gz +0 -0
data/lib/freql/data/small_en.msgpack.gz +0 -0
data/lib/freql/data/small_es.msgpack.gz +0 -0
data/lib/freql/data/small_fa.msgpack.gz +0 -0
data/lib/freql/data/small_fi.msgpack.gz +0 -0
data/lib/freql/data/small_fil.msgpack.gz +0 -0
data/lib/freql/data/small_fr.msgpack.gz +0 -0
data/lib/freql/data/small_he.msgpack.gz +0 -0
data/lib/freql/data/small_hi.msgpack.gz +0 -0
data/lib/freql/data/small_hu.msgpack.gz +0 -0
data/lib/freql/data/small_id.msgpack.gz +0 -0
data/lib/freql/data/small_is.msgpack.gz +0 -0
data/lib/freql/data/small_it.msgpack.gz +0 -0
data/lib/freql/data/small_ja.msgpack.gz +0 -0
data/lib/freql/data/small_ko.msgpack.gz +0 -0
data/lib/freql/data/small_lt.msgpack.gz +0 -0
data/lib/freql/data/small_lv.msgpack.gz +0 -0
data/lib/freql/data/small_mk.msgpack.gz +0 -0
data/lib/freql/data/small_ms.msgpack.gz +0 -0
data/lib/freql/data/small_nb.msgpack.gz +0 -0
data/lib/freql/data/small_nl.msgpack.gz +0 -0
data/lib/freql/data/small_pl.msgpack.gz +0 -0
data/lib/freql/data/small_pt.msgpack.gz +0 -0
data/lib/freql/data/small_ro.msgpack.gz +0 -0
data/lib/freql/data/small_ru.msgpack.gz +0 -0
data/lib/freql/data/small_sh.msgpack.gz +0 -0
data/lib/freql/data/small_sk.msgpack.gz +0 -0
data/lib/freql/data/small_sl.msgpack.gz +0 -0
data/lib/freql/data/small_sv.msgpack.gz +0 -0
data/lib/freql/data/small_ta.msgpack.gz +0 -0
data/lib/freql/data/small_tr.msgpack.gz +0 -0
data/lib/freql/data/small_uk.msgpack.gz +0 -0
data/lib/freql/data/small_ur.msgpack.gz +0 -0
data/lib/freql/data/small_vi.msgpack.gz +0 -0
data/lib/freql/data/small_zh.msgpack.gz +0 -0
data/lib/freql/fpbw.rb +28 -0
data/lib/freql/fpmw.rb +41 -0
data/lib/freql/fq.rb +30 -0
data/lib/freql/rank.rb +39 -0
data/lib/freql/version.rb +5 -0
data/lib/freql/words.rb +44 -0
data/lib/freql/zipf.rb +36 -0
data/lib/freql.rb +13 -0
data/sig/freql.rbs +4 -0
metadata +152 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 74eb8d2a60c57b3a8c329845e419c4b72acb74ca49d4acb251dd8866eecdeea9
+  data.tar.gz: 795c8b37f96c0decd55a4f3eeb01a4ab14daa33bd02389646606814977cb354d
+SHA512:
+  metadata.gz: 3e6c5abbef36d4b09fbc02d2a9b0bbf33ecf8344e519d2074c01b479bfafdf24ab3bf2da0a75f495f2209f46a5b48d000d6a29e9b585452994ed46f79429ae51
+  data.tar.gz: 2db83bc03f6044e5ae181bc6cafd0f3ad19e18586a39f7461aef5aaa1d388eef4378802158978241600c1b7012ad4a90b6a168e6509d59408729c25e9938d377

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,5 @@
+## [Unreleased]
+## [0.1.0] - 2023-01-08
+- Initial release

data/Gemfile ADDED Viewed

@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+source "https://rubygems.org"
+# Specify your gem's dependencies in freql.gemspec
+gemspec
+gem "rake", "~> 13.0"
+gem 'pry', '~> 0.14.1'
+gem "rspec", "~> 3.0"

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,42 @@
+PATH
+  remote: .
+  specs:
+    freql (0.1.0)
+      msgpack (~> 1.5, >= 1.5.1)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    coderay (1.1.3)
+    diff-lcs (1.5.0)
+    method_source (1.0.0)
+    msgpack (1.7.1)
+    pry (0.14.2)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    rake (13.0.6)
+    rspec (3.12.0)
+      rspec-core (~> 3.12.0)
+      rspec-expectations (~> 3.12.0)
+      rspec-mocks (~> 3.12.0)
+    rspec-core (3.12.0)
+      rspec-support (~> 3.12.0)
+    rspec-expectations (3.12.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.12.0)
+    rspec-mocks (3.12.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.12.0)
+    rspec-support (3.12.0)
+PLATFORMS
+  x86_64-linux
+DEPENDENCIES
+  freql!
+  pry (~> 0.14.1)
+  rake (~> 13.0)
+  rspec (~> 3.0)
+BUNDLED WITH
+   2.4.3

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,23 @@
+The MIT License (MIT)
+Copyright (c) 2023 opsaaaaa
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+Any data under `lib/freql/data` comes from the wordfreq repository and is therefore subject to its license.

data/README.md ADDED Viewed

@@ -0,0 +1,89 @@
+# Freql
+*aka. ((word) Frequency Lang/Lib)*
+A library for handling word/token frequencies.
+## features
+- convert cb and fpmw to zipf and other units.
+- basic lookup for word frequencies in various languages.
+- token counting tool
+- tools for building word/token frequency datasets from custom sources
+## Let's educate you about word frequency units real quick.
+| name | description | range | examples |
+| --- | --- | --- | --- |
+| fq | frequency represented as a proportion between 0 and 1. Occurrence count divided by total words/tokens | 0 to 1 | 0.053(the) 0.00000001(trella) |
+| fpmw | frequency per million words. | 1 million to 0 | 53703(the) 0.01(trella) |
+| fpbw | frequency per billion words. | 1 billion to 0 | nah |
+| word rank | Frequency rank relative to all the other words within your corpus. | 1+n | the #1 |
+| zipf scale | It's log10 of frequency per billion words. Named after the American linguist George Kingsley Zipf | 9.0 to 0.0(or less technically) | 7.73(the) to 1.01(trella) |
+| cb | It's a word frequency on a logarithmic centibel scale. Basically zipf optimized for storage. | 0 to -900(or less) | -127(the) -799(trella) |
+| name | Advantages | Disadvantages |
+| --- | --- | --- |
+| fq | simple | ...lots and lots of decimals |
+| fpmw | It's straightforward to calculate and understand | It's not easy for humans to compare. For some words it's less than 1 |
+| fpbw | words aren't going to be less than one. | nobody uses it |
+| zipf scale | Easy for humans to compare. | requires decimals for accuracy |
+| cb | we can safely represent it as a positive integer without sacrificing significant accuracy | less human readable than zipf |
+### Where does cb come from?
+cb is the word frequency unit used by our initial dataset pulled from the wordfreq program.
+https://github.com/rspeer/wordfreq
+> 0 cB represents a word that occurs with probability 1, so it is the only
+> word in the data (this of course doesn't happen). -200 cB represents a
+> word that occurs once per 100 tokens, -300 cB represents a word that
+> occurs once per 1000 tokens, and so on.
+It's very similar to zipf, but with a different scale and 0 point.
+It's always less than 0, so rare values can't cross 0.
+And the numbers are larger, so you don't need decimals for reasonable accuracy.
+You can easily save them as positive integers.
+In the wordfreq program they 'bin' the data to reduce the file size further.
+`array[ bin[ "words", ...], ... ]`
+The index of the bin represents the positive cb frequency value.
+you end up with a lot of leading empty bins, but after that it gets really efficient.
+## Installation
+Install the gem and add to the application's Gemfile by executing:
+    $ bundle add freql
+If bundler is not being used to manage dependencies, install the gem by executing:
+    $ gem install freql
+## Usage
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/opsaaaaa/freql.
+## License
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+## Credits
+- I read the code from the Python wordfreq program. https://github.com/rspeer/wordfreq

data/Rakefile ADDED Viewed

@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+RSpec::Core::RakeTask.new(:spec)
+task default: :spec

data/freql.gemspec ADDED Viewed

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require_relative "lib/freql/version"
+Gem::Specification.new do |spec|
+  spec.name = "freql"
+  spec.version = Freql::VERSION
+  spec.authors = ["opsaaaaa"]
+  spec.email = ["sean@ferney.org"]
+  spec.summary = "A library for handling word/token freqencies units."
+  spec.description = "Right now all we do is convert fpmw to zipf and other units."
+  spec.homepage = "https://github.com/opsaaaaa/freql"
+  spec.license = "MIT"
+  spec.required_ruby_version = ">= 2.6.0"
+  spec.metadata["homepage_uri"] = spec.homepage
+  spec.metadata["source_code_uri"] = spec.homepage
+  spec.metadata["changelog_uri"] = "https://github.com/opsaaaaa/freql/blob/master/CHANGELOG.md"
+  # Specify which files should be added to the gem when it is released.
+  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+  spec.files = Dir.chdir(__dir__) do
+    `git ls-files -z`.split("\x0").reject do |f|
+      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
+    end
+  end
+  spec.bindir = "exe"
+  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_runtime_dependency 'msgpack', '~> 1.5', '>= 1.5.1'
+end

data/lib/freql/bindata.rb ADDED Viewed

@@ -0,0 +1,55 @@
+require 'zlib'
+require 'msgpack'
module Freql
  # BinData compresses key => integer pairs into an array of "bins" where the
  # index of a bin is the integer value shared by every key stored in it:
  #
  #   {'three' => 3}  ->  [[], [], [], ['three']]
  #
  # The same layout is used by the wordfreq program
  # (https://github.com/rspeer/wordfreq) to compress word frequency data.
  module BinData
    # Format template (size, lang) for the bundled language files.
    # It is anchored to this file's directory rather than the process working
    # directory, so the data is found no matter where the caller runs from.
    # Any literal "%" in the directory is escaped to keep the template valid.
    LANG_FILE_PATH = File.join(
      File.expand_path("data", __dir__).gsub("%", "%%"),
      "%s_%s.msgpack.gz"
    ).freeze

    class << self
      # Packs a hash of key => integer into bins.
      #
      # @param hash_data [Hash] keys mapped to non-negative numbers; each
      #   value is truncated with +to_i+ to pick its bin
      # @param size [Integer, nil] index of the last bin; defaults to the
      #   largest value in +hash_data+
      # @return [Array<Array>] +size + 1+ bins; an empty array when
      #   +hash_data+ is empty and no +size+ was given
      # @raise [ArgumentError] if a value is negative or larger than +size+
      def pack(hash_data, size: nil)
        bins = hash_data.transform_values(&:to_i)
        # With no explicit size the highest bin decides the length; an empty
        # hash then needs no bins at all (-1 + 1 == 0).
        last_bin = (size || bins.values.max || -1).to_i
        bin_data = Array.new(last_bin + 1) { [] }
        bins.each do |key, bin|
          # A negative index would silently wrap around to the end of the
          # array and an oversized one would hit nil, so reject both.
          unless (0..last_bin).cover?(bin)
            raise ArgumentError, "bin #{bin} for #{key.inspect} is outside 0..#{last_bin}"
          end

          bin_data[bin] << key
        end
        bin_data
      end

      # Reverses {pack}: every key is mapped to the index of its bin.
      #
      # @param bin_data [Array<Array>] bins as produced by {pack}
      # @return [Hash] key => bin index
      def unpack(bin_data)
        bin_data.each_with_index.with_object({}) do |(group, val), hash_data|
          group.each { |key| hash_data[key] = val }
        end
      end

      # Reads the bundled frequency bins for a language.
      #
      # @param lang [Symbol, String] language code used in the file name
      # @param size [Symbol, String] dataset size prefix used in the file name
      # @yield [data] the bins, when a block is given
      # @return the block's result, or the bins themselves without a block
      def read_lang(lang = :en, size: :small, &block)
        data = Zlib::GzipReader.open(format(LANG_FILE_PATH, size, lang)) do |gz|
          # The first item in the language data contains version and format
          # information, which is ignored for now. The rest is the bindata.
          MessagePack.unpack(gz.read)[1..]
        end
        block ? block.call(data) : data
      end

      # Same as {read_lang}, but the bins are unpacked into a key => bin hash.
      #
      # @yield [hash] the unpacked data, when a block is given
      # @return the block's result, or the unpacked hash without a block
      def read_and_unpack_lang(lang = :en, size: :small, &block)
        unpacked = unpack(read_lang(lang, size: size))
        block ? block.call(unpacked) : unpacked
      end
    end
  end
end

data/lib/freql/cb.rb ADDED Viewed

@@ -0,0 +1,51 @@
module Freql
  # Conversions from the centibel (cB) word frequency scale to other units.
  #
  # cB is a logarithmic word frequency scale and the unit used by the dataset
  # of the Python wordfreq program (https://github.com/rspeer/wordfreq):
  #
  # > 0 cB represents a word that occurs with probability 1, so it is the only
  # > word in the data (this of course doesn't happen). -200 cB represents a
  # > word that occurs once per 100 tokens, -300 cB represents a word that
  # > occurs once per 1000 tokens, and so on.
  #
  # Practical range: -127 ("the") to -799.
  # Actual range: 0 to -900 (or less).
  #
  # Advantages
  # - It is very similar to zipf, but with a different scale and 0 point.
  # - It is really good for storage sizes.
  # - It is always less than 0, so rare values can't cross 0.
  # - The numbers are larger, so you don't need decimals for reasonable accuracy.
  # - You can easily save them as positive integers.
  #
  # Disadvantages
  # - It is less human readable.
  #
  # In the wordfreq program the data is 'binned' to reduce the file size further:
  #
  #   array[ bin[ "words", ...], ... ]
  #
  # The index of the bin represents the positive frequency value. You end up
  # with a lot of leading empty bins, but after that it gets really efficient.
  module CB
    # Number of cB per factor of ten in frequency.
    CB_PER_DECADE = 100.0
    # cB distance between a proportion of 1 and zipf 0: zipf is log10 of the
    # frequency per billion (10**9) words, i.e. 9 decades.
    ZIPF_OFFSET = 900.0
    PER_MILLION = 1_000_000
    PER_BILLION = 1_000_000_000

    class << self
      # Converts cB to a proportion between 0 and 1.
      # The sign of +cb+ is ignored, so stored positive values work too.
      #
      # @param cb [Numeric] frequency in cB
      # @return [Float]
      def cb_to_fq(cb)
        10.0**(-cb.abs / CB_PER_DECADE)
      end

      # Converts cB to frequency per million words.
      #
      # @param cb [Numeric] frequency in cB (sign ignored)
      # @return [Float]
      def cb_to_fpmw(cb)
        cb_to_fq(cb) * PER_MILLION
      end

      # Converts cB to frequency per billion words.
      #
      # @param cb [Numeric] frequency in cB (sign ignored)
      # @return [Float]
      def cb_to_fpbw(cb)
        cb_to_fq(cb) * PER_BILLION
      end

      # Converts cB to the zipf scale.
      #
      # @param cb [Numeric] frequency in cB (sign ignored)
      # @return [Float]
      def cb_to_zipf(cb)
        (ZIPF_OFFSET - cb.abs) / CB_PER_DECADE
      end

      # Computes the cB value of a token from raw counts.
      #
      # @param occurances [Numeric] how often the token was seen
      # @param total [Numeric] total number of tokens counted
      # @return [Float] log10 of the proportion, times 100
      def calc_cb(occurances, total)
        Math.log10(occurances / total.to_f) * CB_PER_DECADE
      end
    end
  end
end

data/lib/freql/counter.rb ADDED Viewed

@@ -0,0 +1,73 @@
module Freql
  # Counts word/token occurrences from various inputs and converts the counts
  # into frequency units.
  class Counter
    # @return [Numeric] number of tokens added so far
    # @return [Hash] token => occurrence count
    attr_reader :total, :tokens

    # @param tokens [Hash] existing token => count data to continue from
    # @param total [Numeric] number of tokens already represented in +tokens+
    def initialize(tokens: {}, total: 0)
      @tokens = tokens
      @total = total
    end

    # Counts every element of +source+ as one token.
    #
    # @param source [#length, #each] collection of tokens
    # @return [Counter] self, so calls can be chained
    def add_array(source)
      @total += source.length
      source.each { |token| add_token(token) }
      self
    end

    # Counts every run of word characters (/\w+/) in +source+.
    #
    # @param source [String]
    # @return [Counter] self
    def add_words(source)
      add_matches(source, /\w+/)
    end

    # Counts a single token.
    #
    # @return [Integer] the token's updated count
    def add_single_token(token)
      @total += 1
      add_token(token)
    end

    # Counts every match of +pattern+ in +source+ (via +source.scan+).
    #
    # @return [Counter] self
    def add_matches(source, pattern)
      add_array(source.scan(pattern))
    end

    # Counts every overlapping window of +size+ consecutive elements of
    # +source+, e.g. "abcd" with size 2 gives "ab", "bc" and "cd".
    # Nothing is added when +source+ is shorter than +size+.
    #
    # @param source [#length, #[]] sliceable sequence (String, Array, ...)
    # @param size [Integer] window length
    # @return [Counter] self
    def add_inflated_pairs(source, size = 2)
      windows = (0..(source.length - size)).map do |start|
        source[start...(start + size)]
      end
      add_array(windows)
    end

    # @return [Hash] token => value from CB.calc_cb
    def compute_cb
      # Convert a local copy only: overwriting @total with a Float would turn
      # #total and every later addition into Floats.
      total = @total.to_f
      tokens.transform_values { |count| CB.calc_cb(count, total) }
    end

    # @return [Hash] token => value from ZipF.calc_zipf
    def compute_zipf
      total = @total.to_f
      tokens.transform_values { |count| ZipF.calc_zipf(count, total) }
    end

    # Packs the rounded absolute cB value of every token with BinData.pack.
    #
    # @return [Array<Array>] the bins; an empty array when nothing was counted
    def compute_bindata
      # There is no largest bin to size the array from when nothing was added.
      return [] if tokens.empty?

      BinData.pack(tokens.transform_values { |count| CB.calc_cb(count, @total).abs.round })
    end

    private

    # Increments the count of +token+, starting at 1 for unseen tokens.
    def add_token(token)
      @tokens[token] = @tokens.fetch(token, 0) + 1
    end
  end
end

data/lib/freql/data/_chinese_mapping.msgpack.gz ADDED Viewed

Binary file