word-bloom 0.1

Sign up to get free protection for your applications and to get access to all the features.
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,60 @@
1
+ # NAME: BitField
2
+ # AUTHOR: Peter Cooper
3
+ # LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
4
+ # COPYRIGHT: (c) 2007 Peter Cooper (http://www.petercooper.co.uk/)
5
+
6
# A fixed-size sequence of bits packed into an Array of Integers.
#
# Bit +position+ lives in element <tt>position / ELEMENT_WIDTH</tt> of
# +field+, at offset <tt>position % ELEMENT_WIDTH</tt> inside that element.
# Positions are not range-checked against +size+.
class BitField
  include Enumerable

  # Number of bits stored in each element of +field+.
  ELEMENT_WIDTH = 32

  # Number of bits this field was created with.
  attr_reader :size
  # The backing Array of Integers (exposed so it can be dumped / restored).
  attr_accessor :field

  # Creates a field of +size+ bits, all initially 0.
  def initialize(size)
    @size = size
    @field = Array.new(((size - 1) / ELEMENT_WIDTH) + 1, 0)
  end

  # Set a bit (1/0). A +value+ of 1 sets the bit; any other value clears it.
  def []=(position, value)
    index = position / ELEMENT_WIDTH
    mask = 1 << (position % ELEMENT_WIDTH)
    if value == 1
      @field[index] |= mask
    else
      # Clear with AND-NOT. (XOR would *toggle*, turning an already-clear
      # bit on when asked to write 0.)
      @field[index] &= ~mask
    end
  end

  # Read a bit (1/0)
  def [](position)
    mask = 1 << (position % ELEMENT_WIDTH)
    (@field[position / ELEMENT_WIDTH] & mask) > 0 ? 1 : 0
  end

  # Iterate over each bit, yielding 1 or 0 for positions 0...size.
  # Returns an Enumerator when called without a block.
  def each
    return enum_for(:each) unless block_given?

    @size.times { |position| yield self[position] }
  end

  # Returns the field as a string like "0101010100111100," etc.
  # (position 0 comes first).
  def to_s
    map { |bit| bit.to_s }.join
  end

  # Returns the bitwise intersection with another BitField as a new BitField
  # whose size is the larger of the two. The smaller field is treated as if
  # it were padded with 0s at the positions it does not cover.
  def &(other)
    return other & self if size < other.size

    result = BitField.new(size)
    # Combine word by word; words the smaller field does not have count as 0.
    result.field = field.each_with_index.map do |word, index|
      word & (other.field[index] || 0)
    end
    result
  end

  # Returns the total number of bits that are set, counted per backing word
  # rather than by iterating every bit position.
  def total_set
    @field.inject(0) { |count, word| count + word.to_s(2).count("1") }
  end
end
@@ -0,0 +1,48 @@
1
+ # NAME: BloominSimple
2
+ # AUTHOR: Peter Cooper
3
+ # LICENSE: MIT ( http://www.opensource.org/licenses/mit-license.php )
4
+ # COPYRIGHT: (c) 2007 Peter Cooper
5
+
6
+ require 'bitfield'
7
+
8
# A simple Bloom filter on top of BitField.
#
# The hasher is any callable that maps an item to an Array of Integers; each
# integer, taken modulo the filter size, names one bit of the bitfield.
class BloominSimple
  attr_accessor :bitfield, :hasher

  # bitsize:: number of bits in the filter.
  # block::   optional hasher (item -> Array of Integers). When omitted, a
  #           default hasher based on String#sum and String#hash is used.
  #
  # NOTE(review): Ruby does not guarantee String#hash to be stable across
  # processes, so filters built with the default hasher are presumably only
  # meaningful within the process that built them - confirm before relying
  # on dump / from_dump without passing an explicit hasher.
  def initialize(bitsize, &block)
    @bitfield = BitField.new(bitsize)
    @size = bitsize
    @hasher = block || lambda do |word|
      word = word.downcase.strip
      h1 = word.sum
      h2 = word.hash
      [h1, h2, h2 + h1 ** 3]
    end
  end

  # Add item to the filter
  def add(item)
    @hasher[item].each { |hi| @bitfield[hi % @size] = 1 }
  end

  # Find out if the filter possibly contains the supplied item.
  # Returns true when every bit selected by the hasher is set, else false.
  def includes?(item)
    @hasher[item].all? { |hi| @bitfield[hi % @size] == 1 }
  end

  # Allows comparison between two filters. Returns number of same bits
  # (bits set in both). Raises RuntimeError when the sizes differ.
  def &(other)
    raise "Wrong sizes" if bitfield.size != other.bitfield.size

    (bitfield & other.bitfield).total_set
  end

  # Dumps the bitfield for a bloom filter for storage: the size followed by
  # the backing words, packed with the "I*" directive (native-endian unsigned
  # ints, so the result is platform dependent).
  def dump
    [@size, *@bitfield.field].pack("I*")
  end

  # Creates a new bloom filter object from a stored dump (hasher has to be
  # resent though for additions).
  # Raises ArgumentError when +data+ does not even contain the size header.
  def self.from_dump(data, &block)
    size, *words = data.unpack("I*")
    raise ArgumentError, "dump is empty or truncated" if size.nil?

    filter = new(size, &block)
    filter.bitfield.field = words
    filter
  end
end
@@ -0,0 +1,16 @@
1
+ require 'bloominsimple'
2
+ require 'digest/sha1'
3
+
4
# Namespace for the language-guessing classes, plus the constants they share.
class WordBloom
  # Hasher handed to the bloom filters: maps a word to two 32-bit integers
  # taken from the first eight bytes of the SHA1 digest of the normalised
  # (downcased, stripped) word.
  #
  # The word is first round-tripped through UTF-16LE with invalid / undefined
  # sequences replaced by "", which drops bytes that are not valid in its
  # encoding. If an ArgumentError is still raised, diagnostic details are
  # written to stderr (not stdout, so a library failure does not pollute the
  # caller's output) and the error is re-raised.
  HASHER = lambda do |item|
    begin
      item = item.encode("UTF-16LE", :invalid => :replace, :undef => :replace, :replace => "").encode("UTF-8")
      Digest::SHA1.digest(item.downcase.strip).unpack("VV")
    rescue ArgumentError => ex
      warn __ENCODING__.inspect
      warn ex.message.inspect
      warn item.inspect
      raise
    end
  end

  # Directory holding the "<language>.lang" filter dumps, resolved relative
  # to this file.
  LANGUAGE_DIR_PATH = File.expand_path("../../lang", __FILE__)
end
@@ -0,0 +1,15 @@
1
class WordBloom
  # Builds a bloom filter from a dictionary file containing one word per line.
  class FilterBuilder
    # Number of bits in every generated filter.
    BITFIELD_WIDTH = 2_000_000

    # source_path:: path of the dictionary file to read.
    def initialize(source_path)
      @filename = source_path
    end

    # Reads the dictionary line by line and adds every line to a new
    # BloominSimple filter of BITFIELD_WIDTH bits using HASHER.
    # Lines are passed as read, including their line terminator.
    # Returns the populated filter.
    def filter_from_dictionary
      filter = BloominSimple.new(BITFIELD_WIDTH, &HASHER)
      # File.foreach closes the file when iteration ends; a bare
      # File.open(...).each would leave the handle open until GC.
      File.foreach(@filename) { |word| filter.add(word) }
      filter
    end
  end
end
@@ -0,0 +1,52 @@
1
+ require 'word-bloom/scorer'
2
+
3
class WordBloom
  # Measures how much the bloom filters of different languages overlap and
  # renders the result as a text table.
  class Quality
    # Builds a Quality covering every language reported by
    # Scorer.all_languages, with the metrics already computed.
    def self.build_all
      qual = new
      qual.languages = Scorer.all_languages
      qual.build_metrics
      qual
    end

    def initialize
      @correlations = {}
      @languages = []
    end

    # Languages to compare.
    attr_accessor :languages
    # Hash of [language, other_language] => overlap ratio.
    attr_reader :correlations

    # Loads the filter of every language, then stores for each unordered pair
    # the number of bits the two filters share, divided by
    # FilterBuilder::BITFIELD_WIDTH.
    def build_metrics
      # Loaded here rather than at file load time, as in the original code.
      require 'word-bloom/filter-builder'

      languages.each { |lang| Scorer.load_filter(lang) }

      languages.each_with_index do |lang, index|
        languages.drop(index + 1).each do |other_lang|
          shared_bits = Scorer.filter_for(lang) & Scorer.filter_for(other_lang)
          @correlations[[lang, other_lang]] = shared_bits.to_f / FilterBuilder::BITFIELD_WIDTH
        end
      end
    end

    # Renders the correlations as a table: a header row of language names,
    # then one row per language. Pairs without a stored correlation (which
    # includes a language paired with itself) are shown as dashes.
    def to_s
      # `|| 0` keeps an empty language list from raising on nil + 2.
      width = (languages.map { |lang| lang.to_s.length }.max || 0) + 2
      col_sep = " "

      header = ([" " * width] + languages.map { |lang| lang.to_s.rjust(width) }).join(col_sep)

      rows = languages.map do |left|
        cells = languages.map do |right|
          # Pairs are stored in one order only, so look up both.
          num = @correlations[[left, right]] || @correlations[[right, left]]
          num ? "%#{width}f" % num : "-" * width
        end
        ([left.to_s.ljust(width)] + cells).join(col_sep)
      end

      ([header] + rows).join("\n")
    end
  end
end
@@ -0,0 +1,108 @@
1
+ require 'word-bloom'
2
+
3
class WordBloom
  # Scores a text against the bloom filters of a set of languages and picks
  # the language whose filter matches the most words.
  class Scorer
    # Multiplier applied to the best score in #confidence.
    OPTIMISM = 3.5
    # Threshold for the early-exit check in #process_text, which is
    # currently disabled.
    MIN_CONFIDENCE = 15

    # Loaded filters, keyed by language name; shared by all scorers.
    @@filters = {}
    # Cached list of languages found in LANGUAGE_DIR_PATH.
    @@all_languages = nil

    # Loads (once) and returns the filter stored in
    # "<LANGUAGE_DIR_PATH>/<name>.lang".
    def self.load_filter(name)
      @@filters[name] ||= File.open(File.join(LANGUAGE_DIR_PATH, "#{name}.lang"), 'rb') do |file|
        BloominSimple.from_dump(file.read, &HASHER)
      end
    end

    # Returns the already-loaded filter for +language+, or nil.
    def self.filter_for(language)
      @@filters[language]
    end

    # Returns the names (as Symbols) of every "*.lang" file in
    # LANGUAGE_DIR_PATH; the listing is cached after the first call.
    def self.all_languages
      @@all_languages ||= Dir.entries(LANGUAGE_DIR_PATH).grep(/\.lang$/).map do |filename|
        filename.sub(/\.lang$/, '').to_sym
      end
    end

    # Returns a new scorer with the given languages added; pass :all alone to
    # add every available language.
    def self.loaded_with(*languages)
      scorer = new
      if languages == [:all]
        scorer.add_all_languages
      else
        languages.each { |language| scorer.add_language(language) }
      end
      scorer
    end

    def initialize
      @languages = {}
      # Every language weighs 1.0 unless overridden; :russian starts at 0.8.
      @language_weights = Hash.new(1.0)
      @language_weights[:russian] = 0.8
    end

    # Loads the filter for +name+ and includes it in scoring. A non-nil
    # +weight+ replaces the language's score multiplier.
    def add_language(name, weight = nil)
      self.class.load_filter(name)
      @languages[name] = true
      @language_weights[name] = weight unless weight.nil?
    end

    # Adds every language reported by Scorer.all_languages.
    def add_all_languages
      self.class.all_languages.each { |language| add_language(language) }
    end

    # Returns OPTIMISM times the best score minus the sum of all other
    # scores, or 0.0 when +results+ is empty. +considered+ is not used.
    def confidence(considered, results)
      scores = results.values.sort
      return 0.0 if scores.empty?

      best = scores[-1]
      # Seed with 0 so a single-language result yields 0 instead of nil.
      rest = scores[0..-2].inject(0) { |sum, number| sum + number }

      OPTIMISM * best - rest
    end

    # Multiplies each score in +results+ by its language weight, in place,
    # and returns +results+.
    def apply_weights(results)
      results.keys.each do |lang|
        results[lang] *= @language_weights[lang]
      end
      results
    end

    # Very inefficient method for now.. but still beats the non-Bloom
    # alternatives.
    # Change to better bit comparison technique later..
    #
    # Splits +text+ on whitespace and, for each word that is not empty or
    # purely digits, adds one point to every language whose filter may
    # contain the downcased word. Returns the weighted scores as a Hash of
    # language => score (languages without matches are absent).
    # On any StandardError the error and backtrace are written to stderr and
    # nil is returned.
    def process_text(text)
      results = Hash.new(0)
      text.split(/\s+/).each do |word|
        word = word.downcase
        next if /^\d*$/ =~ word

        @languages.keys.each do |lang|
          results[lang] += 1 if @@filters[lang].includes?(word)
        end
        # The original early exit (stop once #confidence exceeds
        # MIN_CONFIDENCE) and the 100-word cap were commented out, so every
        # word is always processed.
      end
      apply_weights(results)
    rescue => ex
      warn ex.inspect
      warn ex.backtrace.inspect
      nil
    end

    # Returns the language with the highest weighted score for +text+, or nil
    # when no language matched or the text could not be processed.
    def language(text)
      results = process_text(text)
      return nil if results.nil? || results.empty?

      results.max { |a, b| a[1] <=> b[1] }.first
    end
  end
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word-bloom
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Peter Cooper
9
+ - Judson Lester
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-09-08 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: corundum
17
+ requirement: &71753810 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ~>
21
+ - !ruby/object:Gem::Version
22
+ version: 0.0.1
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *71753810
26
+ description: ! ' Guesses the natural language of a text sample based on bloom filter
27
+ matches of words in the text. Fast, reasonably accurate.
28
+
29
+ '
30
+ email:
31
+ - ''
32
+ - nyarly@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - lib/bloominsimple.rb
38
+ - lib/word-bloom.rb
39
+ - lib/word-bloom/filter-builder.rb
40
+ - lib/word-bloom/scorer.rb
41
+ - lib/word-bloom/quality.rb
42
+ - lib/bitfield.rb
43
+ - lang/pinyin.lang
44
+ - lang/dutch.lang
45
+ - lang/french.lang
46
+ - lang/swedish.lang
47
+ - lang/russian.lang
48
+ - lang/german.lang
49
+ - lang/farsi.lang
50
+ - lang/italian.lang
51
+ - lang/portuguese.lang
52
+ - lang/english.lang
53
+ - lang/spanish.lang
54
+ homepage: http://nyarly.github.com/word-bloom/
55
+ licenses:
56
+ - MIT
57
+ post_install_message: Thanks again to Peter Cooper - JL
58
+ rdoc_options:
59
+ - --inline-source
60
+ - --main
61
+ - doc/README
62
+ - --title
63
+ - word-bloom-0.1 RDoc
64
+ require_paths:
65
+ - lib/
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ segments:
73
+ - 0
74
+ hash: 379118287
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project: word-bloom
83
+ rubygems_version: 1.8.15
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Natural language guessing for text samples
87
+ test_files: []