word-bloom 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lang/dutch.lang +0 -0
- data/lang/english.lang +0 -0
- data/lang/farsi.lang +0 -0
- data/lang/french.lang +0 -0
- data/lang/german.lang +0 -0
- data/lang/italian.lang +0 -0
- data/lang/pinyin.lang +0 -0
- data/lang/portuguese.lang +0 -0
- data/lang/russian.lang +0 -0
- data/lang/spanish.lang +0 -0
- data/lang/swedish.lang +0 -0
- data/lib/bitfield.rb +60 -0
- data/lib/bloominsimple.rb +48 -0
- data/lib/word-bloom.rb +16 -0
- data/lib/word-bloom/filter-builder.rb +15 -0
- data/lib/word-bloom/quality.rb +52 -0
- data/lib/word-bloom/scorer.rb +108 -0
- metadata +87 -0
data/lang/dutch.lang
ADDED
Binary file
|
data/lang/english.lang
ADDED
Binary file
|
data/lang/farsi.lang
ADDED
Binary file
|
data/lang/french.lang
ADDED
Binary file
|
data/lang/german.lang
ADDED
Binary file
|
data/lang/italian.lang
ADDED
Binary file
|
data/lang/pinyin.lang
ADDED
Binary file
|
data/lang/portuguese.lang
ADDED
Binary file
|
data/lang/russian.lang
ADDED
Binary file
|
data/lang/spanish.lang
ADDED
Binary file
|
data/lang/swedish.lang
ADDED
Binary file
|
data/lib/bitfield.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# NAME: BitField
|
2
|
+
# AUTHOR: Peter Cooper
|
3
|
+
# LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
|
4
|
+
# COPYRIGHT: (c) 2007 Peter Cooper (http://www.petercooper.co.uk/)
|
5
|
+
|
6
|
+
# A fixed-size sequence of bits packed into an Array of Integers, with
# ELEMENT_WIDTH bits stored in each array element.
#
# Bit +position+ lives in element <tt>position / ELEMENT_WIDTH</tt> at bit
# offset <tt>position % ELEMENT_WIDTH</tt>.
class BitField
  include Enumerable

  # Number of bits kept in each element of the backing array.
  ELEMENT_WIDTH = 32

  # Number of addressable bits.
  attr_reader :size
  # Backing array of integers (exposed so a field can be dumped / restored).
  attr_accessor :field

  # Creates a field of +size+ bits, all initially 0.
  def initialize(size)
    @size = size
    @field = Array.new(((size - 1) / ELEMENT_WIDTH) + 1, 0)
  end

  # Set a bit (1/0)
  #
  # A +value+ of 1 sets the bit; any other value clears it. Clearing masks
  # the bit out rather than toggling it, so writing 0 to a bit that is
  # already 0 leaves it at 0.
  def []=(position, value)
    word = position / ELEMENT_WIDTH
    mask = 1 << (position % ELEMENT_WIDTH)
    if value == 1
      @field[word] |= mask
    else
      @field[word] &= ~mask
    end
  end

  # Read a bit (1/0)
  def [](position)
    @field[position / ELEMENT_WIDTH][position % ELEMENT_WIDTH]
  end

  # Iterate over each bit, from position 0 up to size - 1.
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    @size.times { |position| yield self[position] }
  end

  # Returns the field as a string like "0101010100111100," etc.
  def to_s
    map { |bit| bit.to_s }.join
  end

  # Returns the bitwise intersection with another BitField as a new BitField
  # as large as the bigger operand. Bits are matched by position; positions
  # the smaller field does not have are treated as 0.
  def &(other)
    return other & self if size < other.size

    result = BitField.new(size)
    result.field = field.each_with_index.map do |word, index|
      # The smaller operand may have fewer backing words: pad with zeros.
      word & (other.field[index] || 0)
    end
    result
  end

  # Returns the total number of bits that are set.
  # Counts per backing integer instead of walking every bit position.
  def total_set
    @field.inject(0) { |total, word| total + word.to_s(2).count("1") }
  end
end
|
data/lib/bloominsimple.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# NAME: BloominSimple
|
2
|
+
# AUTHOR: Peter Cooper
|
3
|
+
# LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
|
4
|
+
# COPYRIGHT: (c) 2007 Peter Cooper
|
5
|
+
|
6
|
+
require 'bitfield'
|
7
|
+
|
8
|
+
# A simple bloom filter on top of BitField.
#
# The hasher is any callable that maps an item to an Array of integers; each
# integer, taken modulo the filter size, names one bit of the filter.
class BloominSimple
  attr_accessor :bitfield, :hasher

  # Creates an empty filter of +bitsize+ bits.
  #
  # The optional block is used as the hasher. The fallback hasher relies on
  # String#hash, which Ruby does not guarantee to be stable between
  # processes, so filters built with it are only meaningful within the
  # process that built them; pass an explicit hasher for anything stored.
  def initialize(bitsize, &block)
    @bitfield = BitField.new(bitsize)
    @size = bitsize
    @hasher = block || lambda do |word|
      word = word.downcase.strip
      [h1 = word.sum, h2 = word.hash, h2 + h1 ** 3]
    end
  end

  # Add item to the filter
  def add(item)
    @hasher[item].each { |hi| @bitfield[hi % @size] = 1 }
  end

  # Find out if the filter possibly contains the supplied item.
  # Returns true only when every bit named by the hasher is set.
  def includes?(item)
    @hasher[item].all? { |hi| @bitfield[hi % @size] == 1 }
  end

  # Allows comparison between two filters. Returns number of same bits.
  # Raises RuntimeError ("Wrong sizes") when the filters differ in size.
  def &(other)
    raise "Wrong sizes" if bitfield.size != other.bitfield.size

    (bitfield & other.bitfield).total_set
  end

  # Dumps the bitfield for a bloom filter for storage.
  # The result is the size followed by the backing integers, packed with the
  # "I*" directive (native unsigned integers).
  def dump
    [@size, *@bitfield.field].pack("I*")
  end

  # Creates a new bloom filter object from a stored dump (hasher has to be
  # resent though for additions).
  #
  # Raises ArgumentError when +data+ is empty or holds fewer integers than a
  # filter of the recorded size needs, instead of returning a filter that
  # fails later on the first lookup.
  def self.from_dump(data, &block)
    size, *words = data.unpack("I*")
    raise ArgumentError, "bloom filter dump is empty" if size.nil?

    filter = new(size, &block)
    needed = filter.bitfield.field.length
    if words.length < needed
      raise ArgumentError,
            "bloom filter dump is truncated (#{words.length} of #{needed} words for #{size} bits)"
    end

    filter.bitfield.field = words
    filter
  end
end
|
data/lib/word-bloom.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'bloominsimple'
|
2
|
+
require 'digest/sha1'
|
3
|
+
|
4
|
+
# Namespace for the word-bloom language guesser and its shared settings.
class WordBloom
  # Hash function shared by all word-bloom filters.
  #
  # The item is round-tripped through UTF-16LE with invalid and undefined
  # sequences dropped, then downcased and stripped. The result is the first
  # two 32-bit little-endian integers of the SHA1 digest of that text.
  #
  # An ArgumentError raised along the way is reported on standard error and
  # then re-raised to the caller.
  HASHER = lambda do |item|
    begin
      item = item.encode("UTF-16LE", :invalid => :replace, :undef => :replace, :replace => "").encode("UTF-8")
      Digest::SHA1.digest(item.downcase.strip).unpack("VV")
    rescue ArgumentError => ex
      # Library code must not write diagnostics to stdout; use stderr.
      warn "WordBloom::HASHER (source encoding #{__ENCODING__}): #{ex.message} for #{item.inspect}"
      raise
    end
  end

  # Directory holding the dumped per-language filters (*.lang), resolved
  # relative to this source file.
  LANGUAGE_DIR_PATH = File.expand_path("../../lang", __FILE__).freeze
end
|
data/lib/word-bloom/filter-builder.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
class WordBloom
  # Builds a bloom filter from a dictionary file, one filter entry per line.
  class FilterBuilder
    # Number of bits in every generated filter.
    BITFIELD_WIDTH = 2_000_000

    # +source_path+ is the path of the dictionary file to read.
    def initialize(source_path)
      @filename = source_path
    end

    # Reads the dictionary line by line, adds every line to a new
    # BloominSimple filter hashed with HASHER, and returns that filter.
    def filter_from_dictionary
      filter = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
      # File.foreach closes the file once iteration ends, unlike a bare
      # File.open(...).each, which leaves the handle open.
      File.foreach(@filename) { |word| filter.add(word) }
      filter
    end
  end
end
|
data/lib/word-bloom/quality.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'word-bloom/scorer'
|
2
|
+
|
3
|
+
class WordBloom
  # Measures how much the bloom filters of different languages overlap.
  class Quality
    # Builds a Quality covering every language Scorer knows about, with the
    # correlations already computed.
    def self.build_all
      qual = new
      qual.languages = Scorer::all_languages
      qual.build_metrics
      qual
    end

    def initialize
      @correlations = {}
      @languages = []
    end

    # Languages to compare.
    attr_accessor :languages
    # Hash of [language, other_language] => overlap ratio.
    attr_reader :correlations

    # Loads the filter of every language, then stores for each unordered pair
    # of languages the number of bits their filters share divided by
    # FilterBuilder::BITFIELD_WIDTH.
    def build_metrics
      # Loaded lazily: only needed here, for FilterBuilder::BITFIELD_WIDTH.
      require 'word-bloom/filter-builder'

      languages.each do |lang|
        Scorer::load_filter(lang)
      end

      languages.each_with_index do |lang, index|
        languages.drop(index + 1).each do |other_lang|
          @correlations[[lang, other_lang]] =
            (Scorer::filter_for(lang) & Scorer::filter_for(other_lang)).to_f / FilterBuilder::BITFIELD_WIDTH
        end
      end
    end

    # Renders the correlations as a text table with one row and one column
    # per language. Pairs without a stored correlation (including a language
    # against itself) are shown as dashes.
    def to_s
      # `max` is nil for an empty language list; fall back to 0 so an empty
      # Quality still renders instead of raising.
      width = (languages.map { |lang| lang.to_s.length }.max || 0) + 2
      col_sep = " "

      header = ([" " * width] + languages.map { |lang| lang.to_s.rjust(width) }).join(col_sep)
      rows = languages.map do |left|
        cells = languages.map do |right|
          num = @correlations[[left, right]] || @correlations[[right, left]]
          num ? "%#{width}f" % num : "-" * width
        end
        ([left.to_s.ljust(width)] + cells).join(col_sep)
      end

      ([header] + rows).join("\n")
    end
  end
end
|
data/lib/word-bloom/scorer.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'word-bloom'
|
2
|
+
|
3
|
+
class WordBloom
  # Guesses the language of a text by counting how many of its words are
  # found in each loaded language's bloom filter.
  class Scorer
    # Multiplier applied to the best score in #confidence.
    OPTIMISM = 3.5
    # Threshold named by the (currently disabled) early exit in #process_text.
    MIN_CONFIDENCE = 15

    # Filters loaded so far, keyed by language name; shared by all scorers.
    @@filters = {}
    # Cached result of .all_languages.
    @@all_languages = nil

    # Loads (once) and returns the filter stored in "<name>.lang" under
    # LANGUAGE_DIR_PATH.
    def self.load_filter(name)
      @@filters[name] ||=
        File.open(File.join(LANGUAGE_DIR_PATH, "#{name}.lang"), 'rb') do |file|
          BloominSimple.from_dump(file.read, &HASHER)
        end
    end

    # Returns the already-loaded filter for +language+, or nil.
    def self.filter_for(language)
      @@filters[language]
    end

    # Returns the names (as symbols) of all "*.lang" files in
    # LANGUAGE_DIR_PATH. Sorted so the order does not depend on the
    # filesystem's directory order.
    def self.all_languages
      @@all_languages ||= Dir.entries(LANGUAGE_DIR_PATH).grep(/\.lang$/).sort.map do |filename|
        filename.sub(/\.lang$/, '').to_sym
      end
    end

    # Returns a new scorer with the given languages added; pass :all alone
    # to add every available language.
    def self.loaded_with(*languages)
      scorer = new
      if [:all] == languages
        scorer.add_all_languages
      else
        languages.each do |language|
          scorer.add_language(language)
        end
      end
      scorer
    end

    def initialize
      @languages = {}
      # Every language weighs 1.0 unless overridden.
      @language_weights = Hash.new(1.0)
      @language_weights[:russian] = 0.8
    end

    # Makes +name+ a candidate language, loading its filter if needed.
    # A non-nil +weight+ replaces the language's score multiplier.
    def add_language(name, weight = nil)
      self.class.load_filter(name)
      @languages[name] = true
      @language_weights[name] = weight unless weight.nil?
    end

    # Adds every language returned by .all_languages.
    def add_all_languages
      self.class.all_languages.each do |language|
        add_language(language)
      end
    end

    # Returns OPTIMISM times the best score minus the sum of all other
    # scores in +results+ (a Hash of language => score).
    # +considered+ is accepted for interface compatibility and not used.
    # Returns 0.0 when +results+ is empty.
    def confidence(considered, results)
      scores = results.values.sort
      return 0.0 if scores.empty?

      best = scores.last
      # Seeded with 0 so a single-language result has a "rest" of 0, not nil.
      rest = scores[0..-2].inject(0) { |sum, number| sum + number }

      OPTIMISM * best - rest
    end

    # Multiplies every score in +results+ by its language weight, in place,
    # and returns +results+.
    def apply_weights(results)
      results.keys.each do |lang|
        results[lang] *= @language_weights[lang]
      end
      results
    end

    # Very inefficient method for now.. but still beats the non-Bloom
    # alternatives.
    # Change to better bit comparison technique later..
    #
    # Splits +text+ on whitespace and, for each word that is not empty or
    # purely digits, adds one to the score of every candidate language whose
    # filter includes the downcased word. Returns the weighted scores as a
    # Hash of language => score, or nil if an error occurs (the error is
    # reported on standard error).
    def process_text(text)
      results = Hash.new(0)
      text.split(/\s+/).each do |word|
        word = word.downcase
        next if word =~ /\A\d*\z/

        @languages.each_key do |lang|
          results[lang] += 1 if @@filters[lang].includes?(word)
        end

        # TODO: every few words, stop early once #confidence exceeds
        # MIN_CONFIDENCE (this check was disabled in the original code).
      end
      apply_weights(results)
    rescue StandardError => ex
      # Best effort: report on stderr rather than stdout and signal failure
      # with nil.
      warn "WordBloom::Scorer#process_text failed: #{ex.class}: #{ex.message}", *Array(ex.backtrace)
      nil
    end

    # Returns the language with the highest weighted score for +text+, or
    # nil when no candidate language matched any word or scoring failed.
    def language(text)
      results = process_text(text)
      return nil if results.nil? || results.empty?

      results.max_by { |_lang, score| score }.first
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: word-bloom
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Cooper
|
9
|
+
- Judson Lester
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-09-08 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: corundum
|
17
|
+
requirement: &71753810 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 0.0.1
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *71753810
|
26
|
+
description: ! ' Guesses the natural language of a text sample based on bloom filter
|
27
|
+
matches of words in the text. Fast, reasonably accurate.
|
28
|
+
|
29
|
+
'
|
30
|
+
email:
|
31
|
+
- ''
|
32
|
+
- nyarly@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- lib/bloominsimple.rb
|
38
|
+
- lib/word-bloom.rb
|
39
|
+
- lib/word-bloom/filter-builder.rb
|
40
|
+
- lib/word-bloom/scorer.rb
|
41
|
+
- lib/word-bloom/quality.rb
|
42
|
+
- lib/bitfield.rb
|
43
|
+
- lang/pinyin.lang
|
44
|
+
- lang/dutch.lang
|
45
|
+
- lang/french.lang
|
46
|
+
- lang/swedish.lang
|
47
|
+
- lang/russian.lang
|
48
|
+
- lang/german.lang
|
49
|
+
- lang/farsi.lang
|
50
|
+
- lang/italian.lang
|
51
|
+
- lang/portuguese.lang
|
52
|
+
- lang/english.lang
|
53
|
+
- lang/spanish.lang
|
54
|
+
homepage: http://nyarly.github.com/word-bloom/
|
55
|
+
licenses:
|
56
|
+
- MIT
|
57
|
+
post_install_message: Thanks again to Peter Cooper - JL
|
58
|
+
rdoc_options:
|
59
|
+
- --inline-source
|
60
|
+
- --main
|
61
|
+
- doc/README
|
62
|
+
- --title
|
63
|
+
- word-bloom-0.1 RDoc
|
64
|
+
require_paths:
|
65
|
+
- lib/
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
hash: 379118287
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project: word-bloom
|
83
|
+
rubygems_version: 1.8.15
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: Natural language guessing for text samples
|
87
|
+
test_files: []
|