word-bloom 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lang/dutch.lang +0 -0
- data/lang/english.lang +0 -0
- data/lang/farsi.lang +0 -0
- data/lang/french.lang +0 -0
- data/lang/german.lang +0 -0
- data/lang/italian.lang +0 -0
- data/lang/pinyin.lang +0 -0
- data/lang/portuguese.lang +0 -0
- data/lang/russian.lang +0 -0
- data/lang/spanish.lang +0 -0
- data/lang/swedish.lang +0 -0
- data/lib/bitfield.rb +60 -0
- data/lib/bloominsimple.rb +48 -0
- data/lib/word-bloom.rb +16 -0
- data/lib/word-bloom/filter-builder.rb +15 -0
- data/lib/word-bloom/quality.rb +52 -0
- data/lib/word-bloom/scorer.rb +108 -0
- metadata +87 -0
data/lang/dutch.lang
ADDED
Binary file
|
data/lang/english.lang
ADDED
Binary file
|
data/lang/farsi.lang
ADDED
Binary file
|
data/lang/french.lang
ADDED
Binary file
|
data/lang/german.lang
ADDED
Binary file
|
data/lang/italian.lang
ADDED
Binary file
|
data/lang/pinyin.lang
ADDED
Binary file
|
data/lang/portuguese.lang
ADDED
Binary file
|
data/lang/russian.lang
ADDED
Binary file
|
data/lang/spanish.lang
ADDED
Binary file
|
data/lang/swedish.lang
ADDED
Binary file
|
data/lib/bitfield.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# NAME: BitField
|
2
|
+
# AUTHOR: Peter Cooper
|
3
|
+
# LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
|
4
|
+
# COPYRIGHT: (c) 2007 Peter Cooper (http://www.petercooper.co.uk/)
|
5
|
+
|
6
|
+
# A fixed-size sequence of bits packed into an Array of Integers
# (ELEMENT_WIDTH bits per element). Bit +position+ lives in element
# +position / ELEMENT_WIDTH+ at offset +position % ELEMENT_WIDTH+.
class BitField
  include Enumerable

  # Number of bits stored in each element of +field+.
  ELEMENT_WIDTH = 32

  # Number of addressable bits, as passed to the constructor.
  attr_reader :size
  # The backing Array of Integers; writable so dumps can be restored.
  attr_accessor :field

  # Creates a field of +size+ bits, all initially 0.
  def initialize(size)
    @size = size
    @field = Array.new(((size - 1) / ELEMENT_WIDTH) + 1, 0)
  end

  # Set a bit (1/0). Any value other than 1 clears the bit.
  def []=(position, value)
    mask = 1 << (position % ELEMENT_WIDTH)
    if value == 1
      @field[position / ELEMENT_WIDTH] |= mask
    else
      # AND with the inverted mask clears the bit. XOR would toggle it and
      # wrongly turn an already-clear bit on.
      @field[position / ELEMENT_WIDTH] &= ~mask
    end
  end

  # Read a bit (1/0)
  def [](position)
    (@field[position / ELEMENT_WIDTH] >> (position % ELEMENT_WIDTH)) & 1
  end

  # Iterate over each bit, yielding 1 or 0 for positions 0...size.
  # Returns an Enumerator when no block is given.
  def each(&block)
    return enum_for(:each) unless block_given?

    @size.times { |position| yield self[position] }
  end

  # Returns the field as a string like "0101010100111100," etc.
  def to_s
    to_a.join
  end

  # Returns the bitwise intersection with another BitField as a new BitField
  # of the larger of the two sizes.
  #
  # NOTE(review): when the sizes differ, +skip+ is a number of *bits* but is
  # used below as an index/count of +field+ *elements*, so the result is only
  # well defined for equally sized fields (the only case BloominSimple#& lets
  # through). Confirm the intended alignment before relying on mixed sizes.
  def &(other)
    return other & self if size < other.size

    skip = size - other.size
    result = BitField.new(size)
    overlap = field[skip..-1].zip(other.field).map { |left, right| left & right }
    result.field = Array.new(skip, 0) + overlap
    result
  end

  # Returns the total number of bits that are set.
  # Counts the "1" digits of each element's binary representation, which
  # avoids walking the field bit by bit.
  def total_set
    @field.inject(0) { |count, element| count + element.to_s(2).count("1") }
  end
end
|
data/lib/bloominsimple.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# NAME: BloominSimple
|
2
|
+
# AUTHOR: Peter Cooper
|
3
|
+
# LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
|
4
|
+
# COPYRIGHT: (c) 2007 Peter Cooper
|
5
|
+
|
6
|
+
require 'bitfield'
|
7
|
+
|
8
|
+
# A small Bloom filter backed by a BitField. Each item is mapped by the
# hasher to a list of integers; every integer (modulo the filter size)
# selects one bit.
class BloominSimple
  attr_accessor :bitfield, :hasher

  # bitsize - number of bits in the filter.
  # block   - optional hasher; receives an item and returns an Array of
  #           Integers. Without a block a default string hasher is used.
  #
  # NOTE(review): the default hasher relies on String#hash, which Ruby seeds
  # per process, so filters built with it are presumably not reusable after
  # a dump/reload in another process - confirm before depending on it.
  def initialize(bitsize, &block)
    @size = bitsize
    @bitfield = BitField.new(bitsize)
    @hasher = block || lambda do |word|
      normalized = word.downcase.strip
      checksum = normalized.sum
      digest = normalized.hash
      [checksum, digest, digest + checksum ** 3]
    end
  end

  # Add item to the filter
  def add(item)
    @hasher[item].each do |hash_value|
      @bitfield[hash_value % @size] = 1
    end
  end

  # Find out if the filter possibly contains the supplied item
  # (true may be a false positive; false is definitive).
  def includes?(item)
    @hasher[item].all? { |hash_value| @bitfield[hash_value % @size] == 1 }
  end

  # Allows comparison between two filters. Returns number of same bits.
  # Raises RuntimeError ("Wrong sizes") when the bitfields differ in size.
  def &(other)
    raise "Wrong sizes" unless bitfield.size == other.bitfield.size

    (bitfield & other.bitfield).total_set
  end

  # Dumps the bitfield for a bloom filter for storage: the size followed by
  # every bitfield element, packed as native unsigned ints ("I*").
  def dump
    ([@size] + @bitfield.field).pack("I*")
  end

  # Creates a new bloom filter object from a stored dump (hasher has to be resent though for additions)
  def self.from_dump(data, &block)
    size, *elements = data.unpack("I*")
    filter = new(size, &block)
    filter.bitfield.field = elements
    filter
  end
end
|
data/lib/word-bloom.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'bloominsimple'
|
2
|
+
require 'digest/sha1'
|
3
|
+
|
4
|
+
# Namespace for the language-guessing classes; holds the shared hasher and
# the location of the bundled language filters.
class WordBloom
  # Hasher handed to BloominSimple. The item is round-tripped through
  # UTF-16LE (dropping invalid/undefined sequences) back to UTF-8, then
  # downcased and stripped; the first eight bytes of its SHA1 digest are
  # returned as two 32-bit little-endian unsigned integers.
  # On ArgumentError the source encoding, the message and the item are
  # printed before the error is re-raised.
  HASHER = lambda do |item|
    begin
      utf16 = item.encode("UTF-16LE", :invalid => :replace, :undef => :replace, :replace => "")
      item = utf16.encode("UTF-8")
      normalized = item.downcase.strip
      Digest::SHA1.digest(normalized).unpack("VV")
    rescue ArgumentError => ex
      p __ENCODING__
      p ex.message, item
      raise
    end
  end

  # Directory holding the "<language>.lang" filter dumps, resolved relative
  # to this file.
  LANGUAGE_DIR_PATH = File.expand_path("../../lang", __FILE__)
end
|
data/lib/word-bloom/filter-builder.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
class WordBloom
  # Builds a Bloom filter from a dictionary file, one entry per line.
  class FilterBuilder
    # Number of bits in every generated filter.
    BITFIELD_WIDTH = 2_000_000

    # source_path - path of the dictionary file to read.
    def initialize(source_path)
      @filename = source_path
    end

    # Reads the dictionary line by line and adds each line to a new
    # BloominSimple filter of BITFIELD_WIDTH bits using WordBloom::HASHER.
    # Returns the populated filter.
    def filter_from_dictionary
      filter = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
      # File.foreach closes the file when iteration ends; a bare
      # File.open(...).each would leave the handle open until GC.
      File.foreach(@filename) { |word| filter.add(word) }
      filter
    end
  end
end
|
data/lib/word-bloom/quality.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'word-bloom/scorer'
|
2
|
+
|
3
|
+
class WordBloom
  # Reports how much the filters of different languages overlap: for each
  # pair of languages, the number of bits set in both filters divided by the
  # filter width.
  class Quality
    attr_accessor :languages
    attr_reader :correlations

    # Builds a Quality covering every language Scorer knows about, with
    # metrics already computed.
    def self.build_all
      new.tap do |quality|
        quality.languages = Scorer.all_languages
        quality.build_metrics
      end
    end

    def initialize
      @correlations = {}
      @languages = []
    end

    # Loads the filter of every language, then stores one correlation per
    # unordered pair under the key [earlier_language, later_language].
    def build_metrics
      require 'word-bloom/filter-builder'

      languages.each { |lang| Scorer.load_filter(lang) }

      languages.each_with_index do |lang, index|
        languages.drop(index + 1).each do |other_lang|
          shared_bits = Scorer.filter_for(lang) & Scorer.filter_for(other_lang)
          @correlations[[lang, other_lang]] = shared_bits.to_f / FilterBuilder::BITFIELD_WIDTH
        end
      end
    end

    # Renders the correlations as a text table: a header row of language
    # names, then one row per language. Cells without a stored correlation
    # (including the diagonal) are filled with dashes.
    def to_s
      width = languages.map { |lang| lang.to_s.length }.max + 2
      col_sep = " "

      header = [" " * width] + languages.map { |lang| lang.to_s.rjust(width) }

      rows = languages.map do |left|
        cells = languages.map do |right|
          # Pairs are stored once, so look the key up in both orders.
          num = @correlations[[left, right]] || @correlations[[right, left]]
          num ? "%#{width}f" % num : "-" * width
        end
        ([left.to_s.ljust(width)] + cells).join(col_sep)
      end

      ([header.join(col_sep)] + rows).join("\n")
    end
  end
end
|
data/lib/word-bloom/scorer.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'word-bloom'
|
2
|
+
|
3
|
+
class WordBloom
  # Guesses the language of a text by counting, per loaded language, how many
  # of the text's words are (probably) present in that language's filter.
  class Scorer
    # Filters are cached per language name and shared by all scorers.
    @@filters = {}
    @@all_languages = nil

    # Multiplier applied to the best score when computing confidence.
    OPTIMISM = 3.5
    # Threshold referenced only by the disabled early-exit in process_text.
    MIN_CONFIDENCE = 15

    # Loads (once) and returns the filter stored in
    # "<LANGUAGE_DIR_PATH>/<name>.lang".
    def self.load_filter(name)
      @@filters[name] ||=
        File.open(File.join(LANGUAGE_DIR_PATH, "#{name}.lang"), 'rb') do |file|
          BloominSimple.from_dump(file.read, &HASHER)
        end
    end

    # Returns the cached filter for +language+, or nil if it was never loaded.
    def self.filter_for(language)
      @@filters[language]
    end

    # Returns (and memoizes) the names, as symbols, of every ".lang" file in
    # LANGUAGE_DIR_PATH.
    def self.all_languages
      @@all_languages ||= Dir.entries(LANGUAGE_DIR_PATH).grep(/\.lang$/).map do |filename|
        filename.sub(/\.lang$/,'').to_sym
      end
    end

    # Builds a scorer with the given languages loaded; pass :all alone to
    # load every available language.
    def self.loaded_with(*languages)
      scorer = new
      if languages == [:all]
        scorer.add_all_languages
      else
        languages.each { |language| scorer.add_language(language) }
      end
      scorer
    end

    def initialize()
      @languages = {}
      # Every language weighs 1.0 unless overridden.
      @language_weights = Hash.new(1.0)
      @language_weights[:russian] = 0.8
    end

    # Loads the filter for +name+ and enables it on this scorer. A non-nil
    # +weight+ replaces the language's weight.
    def add_language(name, weight = nil)
      self.class.load_filter(name)
      @languages[name] = true
      @language_weights[name] = weight unless weight.nil?
    end

    # Enables every language returned by Scorer.all_languages.
    def add_all_languages
      self.class.all_languages.each do |language|
        add_language(language)
      end
    end

    # Returns OPTIMISM times the best score minus the sum of all other
    # scores. +considered+ is accepted for interface compatibility but unused.
    # Returns 0.0 when +results+ is empty.
    def confidence(considered, results)
      scores = results.values.sort
      best = scores.last
      return 0.0 if best.nil?

      # Seed with 0 so a single-language result subtracts nothing instead of
      # failing on a nil sum.
      rest = scores[0..-2].inject(0) { |sum, score| sum + score }

      OPTIMISM * best - rest
    end

    # Multiplies each score in +results+ by its language weight, in place,
    # and returns +results+.
    def apply_weights(results)
      results.keys.each do |lang|
        results[lang] *= @language_weights[lang]
      end
      results
    end

    # Very inefficient method for now.. but still beats the non-Bloom
    # alternatives.
    # Change to better bit comparison technique later..
    #
    # Returns a Hash of language => weighted hit count for the
    # whitespace-separated words of +text+ (words consisting only of digits,
    # and empty words, are skipped). On any StandardError the exception and
    # backtrace are printed and nil is returned.
    def process_text(text)
      results = Hash.new(0)
      text.split(/\s+/).each do |word|
        word = word.downcase
        next if /^\d*$/ =~ word

        @languages.each_key do |lang|
          results[lang] += 1 if @@filters[lang].includes?(word)
        end

        # An early exit on a convincing result used to be sketched here
        # (every 4th word: break if confidence(...) > MIN_CONFIDENCE, and a
        # hard stop after 100 words); both were disabled, so all words are
        # always processed.
      end
      apply_weights(results)
    rescue => ex
      p ex, ex.backtrace
      nil
    end

    # Returns the language with the highest weighted score for +text+, or nil
    # when processing failed or no word matched any loaded language.
    def language(text)
      results = process_text(text)
      return nil if results.nil? || results.empty?

      results.max { |a, b| a[1] <=> b[1] }.first
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: word-bloom
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Cooper
|
9
|
+
- Judson Lester
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2012-09-08 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: corundum
|
17
|
+
requirement: &71753810 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ~>
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 0.0.1
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *71753810
|
26
|
+
description: ! ' Guesses the natural language of a text sample based on bloom filter
|
27
|
+
matches of words in the text. Fast, reasonably accurate.
|
28
|
+
|
29
|
+
'
|
30
|
+
email:
|
31
|
+
- ''
|
32
|
+
- nyarly@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- lib/bloominsimple.rb
|
38
|
+
- lib/word-bloom.rb
|
39
|
+
- lib/word-bloom/filter-builder.rb
|
40
|
+
- lib/word-bloom/scorer.rb
|
41
|
+
- lib/word-bloom/quality.rb
|
42
|
+
- lib/bitfield.rb
|
43
|
+
- lang/pinyin.lang
|
44
|
+
- lang/dutch.lang
|
45
|
+
- lang/french.lang
|
46
|
+
- lang/swedish.lang
|
47
|
+
- lang/russian.lang
|
48
|
+
- lang/german.lang
|
49
|
+
- lang/farsi.lang
|
50
|
+
- lang/italian.lang
|
51
|
+
- lang/portuguese.lang
|
52
|
+
- lang/english.lang
|
53
|
+
- lang/spanish.lang
|
54
|
+
homepage: http://nyarly.github.com/word-bloom/
|
55
|
+
licenses:
|
56
|
+
- MIT
|
57
|
+
post_install_message: Thanks again to Peter Cooper - JL
|
58
|
+
rdoc_options:
|
59
|
+
- --inline-source
|
60
|
+
- --main
|
61
|
+
- doc/README
|
62
|
+
- --title
|
63
|
+
- word-bloom-0.1 RDoc
|
64
|
+
require_paths:
|
65
|
+
- lib/
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
hash: 379118287
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project: word-bloom
|
83
|
+
rubygems_version: 1.8.15
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: Natural language guessing for text samples
|
87
|
+
test_files: []
|