segment_ruby 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8d59d60fbd2e0bd1d80132e2e0f18bd1241cc4bb
4
- data.tar.gz: 5025e686e618bf0b54a3ebec6ee97e46f4d769f5
3
+ metadata.gz: 26ffd5e24afa505dec8c31cac0cad0cecafc27bb
4
+ data.tar.gz: 4171510ada22f97967654a976da3c014b5fab121
5
5
  SHA512:
6
- metadata.gz: 85c15f6b79b7be0cb6c3fb3ef8800256889dc85beca996592181a9e5b136c8dae5cf5168f19c1c14fe45b7a2d86063e3c6717c65e8c4eac993a14a991e5b79a4
7
- data.tar.gz: 27bf285191eab61d8fc88acaa65574e6ef2ec32f0d4336665dd8b3799fcc84703e91872d8e9336c08846d94a4656a6ec97412d260bdd8628e0c639671b7299cc
6
+ metadata.gz: fcf6a77d8778367c64563d09be67080b935282250fc625cdb2169ea9f461a0dea2a04ef47a400b7f70ed4f2745c534cd5600b3eb13b9b94285aaf6eb65609348
7
+ data.tar.gz: c7868ead403e68c2154016150456676269376b136dce0de73c30370ce168e27abf839335a46c4854af037a155238241c79ef89a172163055dc16651ef95e5bfa
data/README.md CHANGED
@@ -23,17 +23,17 @@ Or install it yourself as:
23
23
 
24
24
  ```
25
25
  require 'segment_ruby'
26
- t = SegmentRuby::Analyzer.new('twitter'); true
26
+ t = SegmentRuby::Analyzer.new(:twitter)
27
27
  t.segment("theboywholived")
28
28
  => ["the", "boy", "who", "lived"]
29
29
  ```
30
30
  Models include:
31
31
 
32
- - `norvig`: based on Google web data
33
- - `google_books`: based on Google books data
34
- - `anchor`: based on Web anchor text
35
- - `twitter`: based on Twitter data
36
- - `small`: smaller version of the Google books data
32
+ - `:norvig`: based on Google web data
33
+ - `:google_books`: based on Google books data
34
+ - `:anchor`: based on Web anchor text
35
+ - `:twitter`: based on Twitter data
36
+ - `:small`: smaller version of the Google books data
37
37
 
38
38
  The default model is `small`. Use it if it seems to work for you.
39
39
 
@@ -1,3 +1,3 @@
1
1
  module SegmentRuby
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/segment_ruby.rb CHANGED
@@ -1,4 +1,4 @@
1
- require "segment_ruby/version"
1
+ require_relative "./segment_ruby/version"
2
2
  require 'pathname'
3
3
 
4
4
  # Based on "Natural Language Corpus Data"
@@ -13,126 +13,117 @@ module SegmentRuby
13
13
  def initialize(total_file_name, data_file_name)
14
14
  @total_file_name = total_file_name
15
15
  @data_file_name = total_file_name
16
- begin
16
+
17
+ @log_total = begin
17
18
  total = File.read(total_file_name).to_i
18
- @log_total= Math.log2(total)
19
+ Math.log2(total)
19
20
  rescue
20
- @log_total= Math.log2(10**1000)
21
+ Math.log2(10**1000)
21
22
  end
22
23
 
23
- @table = Hash.new{|w| -Float::INFINITY}
24
+ @table = Hash.new { |w| -Float::INFINITY }
25
+
24
26
  File.open(data_file_name).each_line do |line|
25
27
  data = line.split(/\s/)
26
28
  freq = data[-1].to_i
27
29
  keys = data[0..-2]
28
30
  key = keys.join(' ')
29
- log_p = Math.log2(freq) - @log_total
30
- @table[key] = log_p
31
+ log_p = Math.log2(freq) - log_total
32
+
33
+ table[key] = log_p
31
34
  end
32
- true
33
35
  end
34
36
 
35
- def table
36
- @table
37
- end
37
+ attr_reader :log_total, :table
38
38
 
39
39
  def files
40
40
  [@total_file_name, @data_file_name]
41
41
  end
42
42
 
43
43
  def log_prob(w)
44
- @table[w]
44
+ table[w]
45
45
  end
46
46
 
47
47
  def prob(w)
48
- 2**@table[w]
49
- end
50
-
51
- def log_total
52
- @log_total
48
+ 2**table[w]
53
49
  end
54
50
 
55
51
  def total
56
- 2**@log_total
52
+ 2**log_total
57
53
  end
58
54
 
59
55
  def has_key?(w)
60
- @table.has_key?(w)
56
+ table.has_key?(w)
61
57
  end
62
58
  end
63
59
 
64
60
  class Analyzer
65
-
66
- def initialize(model='small', max_word_length=20)
67
- @model = model
61
+ def initialize(model_name=:small, max_word_length=20)
62
+ @model_name = model_name
68
63
  @max_word_length = max_word_length
64
+
69
65
  # unigram log probabilities
70
- @ulp = ProbabilityDistribution.new(total_file_name(''), freq_file_name(''))
66
+ @ulp = ProbabilityDistribution.new(total_file_name, freq_file_name)
67
+
71
68
  # bigram log probabilities
72
69
  btf = total_file_name('2_')
73
70
  bff = freq_file_name('2_')
74
- @blp = ((File.exists?(btf) and File.exists?(bff)) ? ProbabilityDistribution.new(btf, bff) : false)
75
- true
76
- end
77
-
78
- def model
79
- @model
71
+ @blp = (File.exists?(btf) and File.exists?(bff) ? ProbabilityDistribution.new(btf, bff) : false)
80
72
  end
81
73
 
82
- def max_word_length
83
- @max_word_length
84
- end
85
-
86
- def ulp
87
- @ulp
88
- end
89
-
90
- def blp
91
- @blp
92
- end
74
+ attr_reader :blp, :max_word_length, :model_name, :ulp
93
75
 
94
76
  def log_Pr(w)
95
- @ulp.log_prob(w)
77
+ ulp.log_prob(w)
96
78
  end
97
79
 
98
80
  def log_CPr(w, prev)
99
81
  key = [prev, w].join(' ')
100
- (@blp and @blp.has_key?(key)) ? @blp.log_prob(key) : @ulp.log_prob(w)
82
+
83
+ blp and blp.has_key?(key) ? blp.log_prob(key) : ulp.log_prob(w)
84
+ end
85
+
86
+ def total_file_name(prefix='')
87
+ File.join(model_path, prefix + 'total.tsv')
101
88
  end
102
89
 
103
- def total_file_name(prefix)
104
- File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'total.tsv')
90
+ def freq_file_name(prefix='')
91
+ File.join(model_path, prefix + 'frequencies.tsv')
105
92
  end
106
93
 
107
- def freq_file_name(prefix)
108
- File.join(__dir__, "..", "data", "segment_ruby", @model, prefix + 'frequencies.tsv')
94
+ def model_path
95
+ @model_path ||= File.join(__dir__, "..", "data", "segment_ruby", model_name.to_s)
109
96
  end
110
97
 
111
98
  # Returns all the splits of a string up to a given length
112
99
  def splits(text)
113
- (0..[@max_word_length,text.size-1].min).map{|i| [text[0..i], text[i+1..text.size] ] }
100
+ (0..[max_word_length, text.size-1].min).map { |i| [text[0..i], text[i+1..text.size]] }
114
101
  end
115
102
 
116
103
  def combine(pFirst, first, segmented)
117
104
  pRem,rem = segmented
105
+
118
106
  [pFirst+pRem, [first]+rem]
119
107
  end
120
108
 
121
109
  def segment_r(text, prev, n, memo)
122
110
  return [0.0, []] if not text or (text.size == 0)
123
111
  return memo[text] if memo.has_key?(text)
112
+
124
113
  log_p_segment = splits(text).map do |first, rem|
125
114
  log_p = log_CPr(first, prev)
126
115
  combine(log_p, first, segment_r(rem, first, n+1, memo))
127
116
  end.max
117
+
128
118
  memo[text] = log_p_segment
119
+
129
120
  log_p_segment
130
121
  end
131
122
 
132
123
  def segment(text, prev='<S>')
133
- p, segmentation = segment_r(text, prev, 0, Hash.new)
124
+ _, segmentation = segment_r(text, prev, 0, Hash.new)
125
+
134
126
  segmentation
135
127
  end
136
-
137
128
  end
138
129
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: segment_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Fitzgerald
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-13 00:00:00.000000000 Z
11
+ date: 2016-10-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler