freql 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +11 -0
- data/Gemfile.lock +42 -0
- data/LICENSE.txt +23 -0
- data/README.md +89 -0
- data/Rakefile +8 -0
- data/freql.gemspec +35 -0
- data/lib/freql/bindata.rb +55 -0
- data/lib/freql/cb.rb +51 -0
- data/lib/freql/counter.rb +73 -0
- data/lib/freql/data/_chinese_mapping.msgpack.gz +0 -0
- data/lib/freql/data/jieba_zh.txt +38811 -0
- data/lib/freql/data/jieba_zh_orig.txt +349046 -0
- data/lib/freql/data/large_ar.msgpack.gz +0 -0
- data/lib/freql/data/large_bn.msgpack.gz +0 -0
- data/lib/freql/data/large_ca.msgpack.gz +0 -0
- data/lib/freql/data/large_cs.msgpack.gz +0 -0
- data/lib/freql/data/large_de.msgpack.gz +0 -0
- data/lib/freql/data/large_en.msgpack.gz +0 -0
- data/lib/freql/data/large_es.msgpack.gz +0 -0
- data/lib/freql/data/large_fi.msgpack.gz +0 -0
- data/lib/freql/data/large_fr.msgpack.gz +0 -0
- data/lib/freql/data/large_he.msgpack.gz +0 -0
- data/lib/freql/data/large_it.msgpack.gz +0 -0
- data/lib/freql/data/large_ja.msgpack.gz +0 -0
- data/lib/freql/data/large_mk.msgpack.gz +0 -0
- data/lib/freql/data/large_nb.msgpack.gz +0 -0
- data/lib/freql/data/large_nl.msgpack.gz +0 -0
- data/lib/freql/data/large_pl.msgpack.gz +0 -0
- data/lib/freql/data/large_pt.msgpack.gz +0 -0
- data/lib/freql/data/large_ru.msgpack.gz +0 -0
- data/lib/freql/data/large_sv.msgpack.gz +0 -0
- data/lib/freql/data/large_uk.msgpack.gz +0 -0
- data/lib/freql/data/large_zh.msgpack.gz +0 -0
- data/lib/freql/data/small_ar.msgpack.gz +0 -0
- data/lib/freql/data/small_bg.msgpack.gz +0 -0
- data/lib/freql/data/small_bn.msgpack.gz +0 -0
- data/lib/freql/data/small_ca.msgpack.gz +0 -0
- data/lib/freql/data/small_cs.msgpack.gz +0 -0
- data/lib/freql/data/small_da.msgpack.gz +0 -0
- data/lib/freql/data/small_de.msgpack.gz +0 -0
- data/lib/freql/data/small_el.msgpack.gz +0 -0
- data/lib/freql/data/small_en.msgpack.gz +0 -0
- data/lib/freql/data/small_es.msgpack.gz +0 -0
- data/lib/freql/data/small_fa.msgpack.gz +0 -0
- data/lib/freql/data/small_fi.msgpack.gz +0 -0
- data/lib/freql/data/small_fil.msgpack.gz +0 -0
- data/lib/freql/data/small_fr.msgpack.gz +0 -0
- data/lib/freql/data/small_he.msgpack.gz +0 -0
- data/lib/freql/data/small_hi.msgpack.gz +0 -0
- data/lib/freql/data/small_hu.msgpack.gz +0 -0
- data/lib/freql/data/small_id.msgpack.gz +0 -0
- data/lib/freql/data/small_is.msgpack.gz +0 -0
- data/lib/freql/data/small_it.msgpack.gz +0 -0
- data/lib/freql/data/small_ja.msgpack.gz +0 -0
- data/lib/freql/data/small_ko.msgpack.gz +0 -0
- data/lib/freql/data/small_lt.msgpack.gz +0 -0
- data/lib/freql/data/small_lv.msgpack.gz +0 -0
- data/lib/freql/data/small_mk.msgpack.gz +0 -0
- data/lib/freql/data/small_ms.msgpack.gz +0 -0
- data/lib/freql/data/small_nb.msgpack.gz +0 -0
- data/lib/freql/data/small_nl.msgpack.gz +0 -0
- data/lib/freql/data/small_pl.msgpack.gz +0 -0
- data/lib/freql/data/small_pt.msgpack.gz +0 -0
- data/lib/freql/data/small_ro.msgpack.gz +0 -0
- data/lib/freql/data/small_ru.msgpack.gz +0 -0
- data/lib/freql/data/small_sh.msgpack.gz +0 -0
- data/lib/freql/data/small_sk.msgpack.gz +0 -0
- data/lib/freql/data/small_sl.msgpack.gz +0 -0
- data/lib/freql/data/small_sv.msgpack.gz +0 -0
- data/lib/freql/data/small_ta.msgpack.gz +0 -0
- data/lib/freql/data/small_tr.msgpack.gz +0 -0
- data/lib/freql/data/small_uk.msgpack.gz +0 -0
- data/lib/freql/data/small_ur.msgpack.gz +0 -0
- data/lib/freql/data/small_vi.msgpack.gz +0 -0
- data/lib/freql/data/small_zh.msgpack.gz +0 -0
- data/lib/freql/fpbw.rb +28 -0
- data/lib/freql/fpmw.rb +41 -0
- data/lib/freql/fq.rb +30 -0
- data/lib/freql/rank.rb +39 -0
- data/lib/freql/version.rb +5 -0
- data/lib/freql/words.rb +44 -0
- data/lib/freql/zipf.rb +36 -0
- data/lib/freql.rb +13 -0
- data/sig/freql.rbs +4 -0
- metadata +152 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 74eb8d2a60c57b3a8c329845e419c4b72acb74ca49d4acb251dd8866eecdeea9
|
|
4
|
+
data.tar.gz: 795c8b37f96c0decd55a4f3eeb01a4ab14daa33bd02389646606814977cb354d
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 3e6c5abbef36d4b09fbc02d2a9b0bbf33ecf8344e519d2074c01b479bfafdf24ab3bf2da0a75f495f2209f46a5b48d000d6a29e9b585452994ed46f79429ae51
|
|
7
|
+
data.tar.gz: 2db83bc03f6044e5ae181bc6cafd0f3ad19e18586a39f7461aef5aaa1d388eef4378802158978241600c1b7012ad4a90b6a168e6509d59408729c25e9938d377
|
data/.rspec
ADDED
data/CHANGELOG.md
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
freql (0.1.0)
|
|
5
|
+
msgpack (~> 1.5, >= 1.5.1)
|
|
6
|
+
|
|
7
|
+
GEM
|
|
8
|
+
remote: https://rubygems.org/
|
|
9
|
+
specs:
|
|
10
|
+
coderay (1.1.3)
|
|
11
|
+
diff-lcs (1.5.0)
|
|
12
|
+
method_source (1.0.0)
|
|
13
|
+
msgpack (1.7.1)
|
|
14
|
+
pry (0.14.2)
|
|
15
|
+
coderay (~> 1.1)
|
|
16
|
+
method_source (~> 1.0)
|
|
17
|
+
rake (13.0.6)
|
|
18
|
+
rspec (3.12.0)
|
|
19
|
+
rspec-core (~> 3.12.0)
|
|
20
|
+
rspec-expectations (~> 3.12.0)
|
|
21
|
+
rspec-mocks (~> 3.12.0)
|
|
22
|
+
rspec-core (3.12.0)
|
|
23
|
+
rspec-support (~> 3.12.0)
|
|
24
|
+
rspec-expectations (3.12.2)
|
|
25
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
26
|
+
rspec-support (~> 3.12.0)
|
|
27
|
+
rspec-mocks (3.12.2)
|
|
28
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
29
|
+
rspec-support (~> 3.12.0)
|
|
30
|
+
rspec-support (3.12.0)
|
|
31
|
+
|
|
32
|
+
PLATFORMS
|
|
33
|
+
x86_64-linux
|
|
34
|
+
|
|
35
|
+
DEPENDENCIES
|
|
36
|
+
freql!
|
|
37
|
+
pry (~> 0.14.1)
|
|
38
|
+
rake (~> 13.0)
|
|
39
|
+
rspec (~> 3.0)
|
|
40
|
+
|
|
41
|
+
BUNDLED WITH
|
|
42
|
+
2.4.3
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 opsaaaaa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
|
22
|
+
|
|
23
|
+
Any data under `lib/freql/data` comes from the wordfreq repository and is therefore subject to its License
|
data/README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Freql
|
|
2
|
+
|
|
3
|
+
*aka. ((word) Frequency Lang/Lib)*
|
|
4
|
+
|
|
5
|
+
A library for handling word/token frequencies.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
## features
|
|
9
|
+
- convert cb and fpmw to zipf and other units.
|
|
10
|
+
- basic lookup for word frequencies in various languages.
|
|
11
|
+
- token counting tool
|
|
12
|
+
- tools for building word/token frequency datasets from custom sources
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
## Lets educate you about word frequency units real quick.
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
| name | description | range | examples |
|
|
19
|
+
| --- | --- | --- | --- |
|
|
20
|
+
| fq | frequency represented as a proportion between 0 and 1. Occurrence count divided by total words/tokens | 0 to 1 | 0.053(the) 0.00000001(trella) |
|
|
21
|
+
| fpmw | frequency per million words. | 1 million to 0 | 53703(the) 0.01(trella) |
|
|
22
|
+
| fpbw | frequency per billion words. | 1 billion to 0 | nah |
|
|
23
|
+
| word rank | Frequency rank relative to all the other words within your corpus. | 1+n | the #1 |
|
|
24
|
+
| zipf scale | It's log10 of frequency per billion words. Named after the American linguist George Kingsley Zipf | 9.0 to 0.0(or less technically) | 7.73(the) to 1.01(trella) |
|
|
25
|
+
| cb | It's a word frequency on a logarithmic centibel scale. Basically zipf optimized for storage. | 0 to -900(or less) | -127(the) -799(trella) |
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
| name | Advantages | Disadvantages |
|
|
29
|
+
| --- | --- | --- |
|
|
30
|
+
| fq | simple | ...lots and lots of decimals |
|
|
31
|
+
| fpmw | It's straightforward to calculate and understand | It's not easy for humans to compare. For some words it's less than 1 |
|
|
32
|
+
| fpbw | words aren't going to be less than one. | nobody uses it |
|
|
33
|
+
| zipf scale | Easy for humans to compare. | requires decimals for accuracy |
|
|
34
|
+
| cb | we can safely represent it as a positive integer without sacrificing significant accuracy | less human readable than zipf |
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
### Where does cb come from?
|
|
38
|
+
|
|
39
|
+
cb is the word frequency unit used by our initial dataset pulled from the wordfreq program.
|
|
40
|
+
https://github.com/rspeer/wordfreq
|
|
41
|
+
|
|
42
|
+
> 0 cB represents a word that occurs with probability 1, so it is the only
|
|
43
|
+
> word in the data (this of course doesn't happen). -200 cB represents a
|
|
44
|
+
> word that occurs once per 100 tokens, -300 cB represents a word that
|
|
45
|
+
> occurs once per 1000 tokens, and so on.
|
|
46
|
+
|
|
47
|
+
It's very similar to zipf, but with a different scale and 0 point.
|
|
48
|
+
It's always less than 0, so rare values can't cross 0.
|
|
49
|
+
And the numbers are larger, so you don't need decimals for reasonable accuracy.
|
|
50
|
+
You can easily save them as positive integers.
|
|
51
|
+
|
|
52
|
+
In the wordfreq program they 'bin' the data to reduce the file size further.
|
|
53
|
+
`array[ bin[ "words", ...], ... ]`
|
|
54
|
+
The index of the bin represents the positive cb frequency value.
|
|
55
|
+
you end up with a lot of leading empty bins, but after that it gets really efficient.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
Install the gem and add to the application's Gemfile by executing:
|
|
61
|
+
|
|
62
|
+
$ bundle add freql
|
|
63
|
+
|
|
64
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
|
65
|
+
|
|
66
|
+
$ gem install freql
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
## Development
|
|
71
|
+
|
|
72
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
73
|
+
|
|
74
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
75
|
+
|
|
76
|
+
## Contributing
|
|
77
|
+
|
|
78
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/opsaaaaa/freql.
|
|
79
|
+
|
|
80
|
+
## License
|
|
81
|
+
|
|
82
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
|
83
|
+
|
|
84
|
+
## Credits
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
- I read the code from the python wordfreq program. https://github.com/rspeer/wordfreq
|
|
88
|
+
|
|
89
|
+
|
data/Rakefile
ADDED
data/freql.gemspec
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/freql/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
|
|
6
|
+
spec.name = "freql"
|
|
7
|
+
spec.version = Freql::VERSION
|
|
8
|
+
spec.authors = ["opsaaaaa"]
|
|
9
|
+
spec.email = ["sean@ferney.org"]
|
|
10
|
+
|
|
11
|
+
spec.summary = "A library for handling word/token freqencies units."
|
|
12
|
+
|
|
13
|
+
spec.description = "Right now all we do is convert fpmw to zipf and other units."
|
|
14
|
+
spec.homepage = "https://github.com/opsaaaaa/freql"
|
|
15
|
+
spec.license = "MIT"
|
|
16
|
+
spec.required_ruby_version = ">= 2.6.0"
|
|
17
|
+
|
|
18
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
|
19
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
|
20
|
+
spec.metadata["changelog_uri"] = "https://github.com/opsaaaaa/freql/blob/master/CHANGELOG.md"
|
|
21
|
+
|
|
22
|
+
# Specify which files should be added to the gem when it is released.
|
|
23
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
24
|
+
spec.files = Dir.chdir(__dir__) do
|
|
25
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
|
26
|
+
(f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|circleci)|appveyor)})
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
spec.bindir = "exe"
|
|
30
|
+
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
31
|
+
spec.require_paths = ["lib"]
|
|
32
|
+
|
|
33
|
+
spec.add_runtime_dependency 'msgpack', '~> 1.5', '>= 1.5.1'
|
|
34
|
+
|
|
35
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
require 'zlib'
|
|
2
|
+
require 'msgpack'
|
|
3
|
+
|
|
4
|
+
module Freql
  module BinData

    # BinData compresses key => integer pair data into an array of "bins",
    # where the index of a bin is the integer value of every key stored in it.
    #
    #   {'three' => 3}  =>  [[], [], [], ['three']]
    #
    # It is used by the [wordfreq](https://github.com/rspeer/wordfreq) program
    # to compress word frequency data.

    # Format template for a bundled language file; filled in with [size, lang].
    # It is anchored to this file's directory so lookups do not depend on the
    # process's working directory. A literal "%" in the directory name is
    # escaped so it survives the String#% call in read_lang.
    LANG_FILE_PATH = File.join(__dir__.gsub("%", "%%"), "data", "%s_%s.msgpack.gz").freeze

    class << self

      # Packs a key => integer hash into bins.
      #
      # hash_data - Hash whose values respond to #to_i and are not negative.
      # size      - optional index of the last bin to allocate; the result
      #             always grows enough to hold the largest value.
      #
      # Returns an Array of Arrays; an empty hash with no size gives [].
      # Raises ArgumentError when a value is negative, because a negative
      # index would silently wrap around to the end of the array.
      def pack(hash_data, size: nil)
        indexed = hash_data.map { |key, val| [key, val.to_i] }

        negative = indexed.find { |_key, index| index.negative? }
        raise ArgumentError, "cannot bin negative value for #{negative.first.inspect}" if negative

        last_index = [size, indexed.map(&:last).max].compact.max
        return [] if last_index.nil?

        bin_data = Array.new(last_index + 1) { [] }
        indexed.each { |key, index| bin_data[index] << key }
        bin_data
      end

      # Reverses #pack: every key found in the bin at index N maps to N.
      #
      # bin_data - Array of Arrays of keys.
      #
      # Returns a Hash of key => Integer.
      def unpack(bin_data)
        bin_data.each_with_index.with_object({}) do |(group, val), hash_data|
          group.each { |key| hash_data[key] = val }
        end
      end

      # Reads the bundled frequency bins for a language.
      #
      # lang - language code used in the file name (default :en).
      # size - data set size used in the file name (default :small).
      #
      # The first item in the language data contains version and format
      # information; it is ignored for now. The rest is word frequency bindata.
      #
      # Yields the bins and returns the block's result when a block is given,
      # otherwise returns the bins.
      def read_lang(lang = :en, size: :small)
        data = Zlib::GzipReader.open(LANG_FILE_PATH % [size, lang]) do |gz|
          MessagePack.unpack(gz.read)[1..]
        end
        block_given? ? yield(data) : data
      end

      # Same as #read_lang, but hands over the unpacked key => value hash.
      def read_and_unpack_lang(lang = :en, size: :small)
        data = unpack(read_lang(lang, size: size))
        block_given? ? yield(data) : data
      end
    end
  end
end
|
data/lib/freql/cb.rb
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
module Freql
  module CB
    # CB is a word frequency on a logarithmic centibel scale.

    # practical range -127(the) to -799
    # actual range is 0 to -900(or less)

    # cb is the word frequency unit used by the dataset from the python wordfreq program.
    # https://github.com/rspeer/wordfreq

    # > 0 cB represents a word that occurs with probability 1, so it is the only
    # > word in the data (this of course doesn't happen). -200 cB represents a
    # > word that occurs once per 100 tokens, -300 cB represents a word that
    # > occurs once per 1000 tokens, and so on.

    # Advantages
    # - It's very similar to zipf, but with a different scale and 0 point.
    # - It's really good for storage sizes.
    # - It's always less than 0, so rare values can't cross 0.
    # - The numbers are larger, so you don't need decimals for reasonable accuracy.
    # - You can easily save them as positive integers.

    # Disadvantages
    # - It's less human readable.

    # In the wordfreq program they 'bin' the data to reduce the file size further.
    # array[ bin[ "words", ...], ... ]
    # The index of the bin represents the positive frequency value.
    # You end up with a lot of leading empty bins, but after that it gets really efficient.

    class << self
      # Converts cb to a proportion between 0 and 1.
      # The sign of cb is ignored, so 200 and -200 give the same result.
      def cb_to_fq(cb)
        10.00**(-cb.abs / 100.00)
      end

      # Converts cb to frequency per million words.
      def cb_to_fpmw(cb)
        cb_to_fq(cb) * 1_000_000
      end

      # Converts cb to frequency per billion words.
      def cb_to_fpbw(cb)
        cb_to_fq(cb) * 1_000_000_000
      end

      # Converts cb to the zipf scale (log10 of frequency per billion words).
      # The sign of cb is ignored.
      def cb_to_zipf(cb)
        (-cb.abs + 900.00) / 100.00
      end

      # Computes cb from an occurrence count and the total token count.
      #
      # occurances - number of times the token was seen.
      # total      - number of tokens counted overall; converted with #to_f.
      #
      # Returns a Float: log10(occurances / total) * 100.
      def calc_cb(occurances, total)
        Math.log10(occurances / total.to_f) * 100.0
      end
    end
  end
end
|
|
51
|
+
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
|
|
2
|
+
module Freql
  class Counter

    # Calculate word/token frequencies from various inputs provided.
    #
    # tokens maps each token to its occurrence count.
    # total is the number of tokens that have been counted.

    attr_reader :total
    attr_reader :tokens

    # tokens - Hash of token => count to start from (it is updated in place).
    # total  - token count to start from.
    def initialize(tokens: {}, total: 0)
      @tokens = tokens
      @total = total
    end

    # Counts every element of source as one token. Returns self.
    def add_array(source)
      @total += source.length
      source.each { |token| add_token(token) }
      self
    end

    # Counts every run of word characters (/\w+/) in source. Returns self.
    def add_words(source)
      add_matches(source, /\w+/)
    end

    # Counts one token. Returns the token's updated count.
    def add_single_token(token)
      @total += 1
      add_token(token)
    end

    # Counts every match of pattern in source (via source.scan). Returns self.
    def add_matches(source, pattern)
      add_array(source.scan(pattern))
    end

    # Counts every overlapping slice of `size` consecutive items in source,
    # e.g. "abcd" with size 2 counts "ab", "bc" and "cd". source only needs
    # #length and #[], so both Strings and Arrays work. Nothing is counted
    # when source is shorter than size. Returns self.
    def add_inflated_pairs(source, size = 2)
      slices = (0..(source.length - size)).map { |start| source[start...start + size] }
      add_array(slices)
    end

    # Returns a Hash of token => cb value. The stored total is not modified.
    def compute_cb
      tokens.transform_values { |count| CB.calc_cb(count, @total.to_f) }
    end

    # Returns a Hash of token => zipf value. The stored total is not modified.
    def compute_zipf
      tokens.transform_values { |count| ZipF.calc_zipf(count, @total.to_f) }
    end

    # Returns the tokens packed as bindata, binned by rounded positive cb value.
    def compute_bindata
      BinData.pack(tokens.transform_values { |count| CB.calc_cb(count, @total).abs.round })
    end

    private

    # Increments the count for token and returns the new count.
    def add_token(token)
      @tokens[token] = @tokens.fetch(token, 0) + 1
    end
  end
end
|
|
Binary file
|