segment_ruby 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8d59d60fbd2e0bd1d80132e2e0f18bd1241cc4bb
4
- data.tar.gz: 5025e686e618bf0b54a3ebec6ee97e46f4d769f5
3
+ metadata.gz: 26ffd5e24afa505dec8c31cac0cad0cecafc27bb
4
+ data.tar.gz: 4171510ada22f97967654a976da3c014b5fab121
5
5
  SHA512:
6
- metadata.gz: 85c15f6b79b7be0cb6c3fb3ef8800256889dc85beca996592181a9e5b136c8dae5cf5168f19c1c14fe45b7a2d86063e3c6717c65e8c4eac993a14a991e5b79a4
7
- data.tar.gz: 27bf285191eab61d8fc88acaa65574e6ef2ec32f0d4336665dd8b3799fcc84703e91872d8e9336c08846d94a4656a6ec97412d260bdd8628e0c639671b7299cc
6
+ metadata.gz: fcf6a77d8778367c64563d09be67080b935282250fc625cdb2169ea9f461a0dea2a04ef47a400b7f70ed4f2745c534cd5600b3eb13b9b94285aaf6eb65609348
7
+ data.tar.gz: c7868ead403e68c2154016150456676269376b136dce0de73c30370ce168e27abf839335a46c4854af037a155238241c79ef89a172163055dc16651ef95e5bfa
data/README.md CHANGED
@@ -23,17 +23,17 @@ Or install it yourself as:
23
23
 
24
24
  ```
25
25
  require 'segment_ruby'
26
- t = SegmentRuby::Analyzer.new('twitter'); true
26
+ t = SegmentRuby::Analyzer.new(:twitter)
27
27
  t.segment("theboywholived")
28
28
  => ["the", "boy", "who", "lived"]
29
29
  ```
30
30
  Models include:
31
31
 
32
- - `norvig`: based on Google web data
33
- - `google_books`: based on Google books data
34
- - `anchor`: based on Web anchor text
35
- - `twitter`: based on Twitter data
36
- - `small`: smaller version of the Google books data
32
+ - `:norvig`: based on Google web data
33
+ - `:google_books`: based on Google books data
34
+ - `:anchor`: based on Web anchor text
35
+ - `:twitter`: based on Twitter data
36
+ - `:small`: smaller version of the Google books data
37
37
 
38
38
  The default model is `small`. Use it if it seems to work for you.
39
39
 
data/lib/segment_ruby/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module SegmentRuby
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/segment_ruby.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "segment_ruby/version"
1
+ require_relative "./segment_ruby/version"
2
2
  require 'pathname'
3
3
 
4
4
  # Based on "Natural Language Corpus Data"
@@ -13,126 +13,117 @@ module SegmentRuby
13
13
  def initialize(total_file_name, data_file_name)
14
14
  @total_file_name = total_file_name
15
15
  @data_file_name = total_file_name
16
- begin
16
+
17
+ @log_total = begin
17
18
  total = File.read(total_file_name).to_i
18
- @log_total= Math.log2(total)
19
+ Math.log2(total)
19
20
  rescue
20
- @log_total= Math.log2(10**1000)
21
+ Math.log2(10**1000)
21
22
  end
22
23
 
23
- @table = Hash.new{|w| -Float::INFINITY}
24
+ @table = Hash.new { |w| -Float::INFINITY }
25
+
24
26
  File.open(data_file_name).each_line do |line|
25
27
  data = line.split(/\s/)
26
28
  freq = data[-1].to_i
27
29
  keys = data[0..-2]
28
30
  key = keys.join(' ')
29
- log_p = Math.log2(freq) - @log_total
30
- @table[key] = log_p
31
+ log_p = Math.log2(freq) - log_total
32
+
33
+ table[key] = log_p
31
34
  end
32
- true
33
35
  end
34
36
 
35
- def table
36
- @table
37
- end
37
+ attr_reader :log_total, :table
38
38
 
39
39
  def files
40
40
  [@total_file_name, @data_file_name]
41
41
  end
42
42
 
43
43
  def log_prob(w)
44
- @table[w]
44
+ table[w]
45
45
  end
46
46
 
47
47
  def prob(w)
48
- 2**@table[w]
49
- end
50
-
51
- def log_total
52
- @log_total
48
+ 2**table[w]
53
49
  end
54
50
 
55
51
  def total
56
- 2**@log_total
52
+ 2**log_total
57
53
  end
58
54
 
59
55
  def has_key?(w)
60
- @table.has_key?(w)
56
+ table.has_key?(w)
61
57
  end
62
58
  end
63
59
 
64
60
  class Analyzer
65
-
66
- def initialize(model='small', max_word_length=20)
67
- @model = model
61
+ def initialize(model_name=:small, max_word_length=20)
62
+ @model_name = model_name
68
63
  @max_word_length = max_word_length
64
+
69
65
  # unigram log probabilities
70
- @ulp = ProbabilityDistribution.new(total_file_name(''), freq_file_name(''))
66
+ @ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)
67
+
71
68
  # bigram log probabilities
72
69
  btf = total_file_name('2_')
73
70
  bff = freq_file_name('2_')
74
- @blp = ((File.exists?(btf) and File.exists?(bff)) ? ProbabilityDistribution.new(btf, bff) : false)
75
- true
76
- end
77
-
78
- def model
79
- @model
71
+ @blp = (File.exists?(btf) and File.exists?(bff) ? ProbabilityDistribution.new(btf, bff) : false)
80
72
  end
81
73
 
82
- def max_word_length
83
- @max_word_length
84
- end
85
-
86
- def ulp
87
- @ulp
88
- end
89
-
90
- def blp
91
- @blp
92
- end
74
+ attr_reader :blp, :max_word_length, :model_name, :ulp
93
75
 
94
76
  def log_Pr(w)
95
- @ulp.log_prob(w)
77
+ ulp.log_prob(w)
96
78
  end
97
79
 
98
80
  def log_CPr(w, prev)
99
81
  key = [prev, w].join(' ')
100
- (@blp and @blp.has_key?(key)) ? @blp.log_prob(key) : @ulp.log_prob(w)
82
+
83
+ blp and blp.has_key?(key) ? blp.log_prob(key) : ulp.log_prob(w)
84
+ end
85
+
86
+ def total_file_name(prefix='')
87
+ File.join(model_path, prefix + 'total.tsv')
101
88
  end
102
89
 
103
- def total_file_name(prefix)
104
- File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'total.tsv')
90
+ def freq_file_name(prefix='')
91
+ File.join(model_path, prefix + 'frequencies.tsv')
105
92
  end
106
93
 
107
- def freq_file_name(prefix)
108
- File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'frequencies.tsv')
94
+ def model_path
95
+ @model_path ||= File.join(__dir__, "..", "data", "segment_ruby", model_name.to_s)
109
96
  end
110
97
 
111
98
  # Returns all the splits of a string up to a given length
112
99
  def splits(text)
113
- (0..[@max_word_length,text.size-1].min).map{|i| [text[0..i], text[i+1..text.size] ] }
100
+ (0..[max_word_length, text.size-1].min).map { |i| [text[0..i], text[i+1..text.size]] }
114
101
  end
115
102
 
116
103
  def combine(pFirst, first, segmented)
117
104
  pRem,rem = segmented
105
+
118
106
  [pFirst+pRem, [first]+rem]
119
107
  end
120
108
 
121
109
  def segment_r(text, prev, n, memo)
122
110
  return [0.0, []] if not text or (text.size == 0)
123
111
  return memo[text] if memo.has_key?(text)
112
+
124
113
  log_p_segment = splits(text).map do |first, rem|
125
114
  log_p = log_CPr(first, prev)
126
115
  combine(log_p, first, segment_r(rem, first, n+1, memo))
127
116
  end.max
117
+
128
118
  memo[text] = log_p_segment
119
+
129
120
  log_p_segment
130
121
  end
131
122
 
132
123
  def segment(text, prev='<S>')
133
- p, segmentation = segment_r(text, prev, 0, Hash.new)
124
+ _, segmentation = segment_r(text, prev, 0, Hash.new)
125
+
134
126
  segmentation
135
127
  end
136
-
137
128
  end
138
129
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: segment_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Fitzgerald
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-13 00:00:00.000000000 Z
11
+ date: 2016-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler