tiny_segmenter 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +4 -2
- data/lib/tiny_segmenter.rb +55 -24
- data/lib/tiny_segmenter/segmentation_model.rb +45 -43
- data/lib/tiny_segmenter/version.rb +1 -1
- data/spec/spec_helper.rb +0 -1
- data/spec/tiny_segmenter_spec.rb +14 -10
- data/tiny_segmenter.gemspec +4 -4
- metadata +21 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b4bdaf300656970e564a69ccf28cf682cda90d4
|
4
|
+
data.tar.gz: d2904cf58175c7a869dfbc10406961d7aa550213
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a5a7457261ac16d4f9ddb050e04fb5dd742dd02bb5a576ae4d47d2d8f322d7a680286617014f3245f84867414f12dcea5cb2bd2d6270909bc999b12e3091b0da
|
7
|
+
data.tar.gz: 412f0f901042b0d566d7642d01cdf8346f38c2bf3f25dc246f485afe960014a80f15a3f5ae21fd7d24f4210fb055b47f64fa7d5582898ee55ca5725baa23b703
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -10,8 +10,10 @@ Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/)
|
|
10
10
|
|
11
11
|
```ruby
|
12
12
|
ts = TinySegmenter.new
|
13
|
-
|
14
|
-
# => ["今晩", "は", "!", "良い", "天気", "です", "ね"]
|
13
|
+
ts.segment("今晩は!良い天気ですね。")
|
14
|
+
# => ["今晩", "は", "!", "良い", "天気", "です", "ね", "。"]
|
15
|
+
ts.segment("今晩は!良い天気ですね。", ignore_punctuation: true)
|
16
|
+
# => ["今晩", "は", "良い", "天気", "です", "ね"]
|
15
17
|
```
|
16
18
|
|
17
19
|
Input text should be UTF-8 encoded.
|
data/lib/tiny_segmenter.rb
CHANGED
@@ -43,14 +43,20 @@ class TinySegmenter
|
|
43
43
|
p1, p2, p3 = %w[U U U]
|
44
44
|
(4..segments.size-4).to_a.each do |i|
|
45
45
|
score = @BIAS
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
46
|
+
w1 = segments[i - 3]
|
47
|
+
w2 = segments[i - 2]
|
48
|
+
w3 = segments[i - 1]
|
49
|
+
w4 = segments[i]
|
50
|
+
w5 = segments[i + 1]
|
51
|
+
w6 = segments[i + 2]
|
52
|
+
c1 = ctypes[i - 3]
|
53
|
+
c2 = ctypes[i - 2]
|
54
|
+
c3 = ctypes[i - 1]
|
55
|
+
c4 = ctypes[i]
|
56
|
+
c5 = ctypes[i + 1]
|
57
|
+
c6 = ctypes[i + 2]
|
52
58
|
|
53
|
-
score += sum_scores(p1, p2, p3,
|
59
|
+
score += sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)
|
54
60
|
p_new = "O"
|
55
61
|
if score > 0
|
56
62
|
result << word
|
@@ -75,23 +81,48 @@ class TinySegmenter
|
|
75
81
|
|
76
82
|
def sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)
|
77
83
|
score = 0
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
84
|
+
score += @model.score(:UP1, p1)
|
85
|
+
score += @model.score(:UP2, p2)
|
86
|
+
score += @model.score(:UP3, p3)
|
87
|
+
score += @model.score(:BP1, p1 + p2)
|
88
|
+
score += @model.score(:BP2, p2 + p3)
|
89
|
+
score += @model.score(:UW1, w1)
|
90
|
+
score += @model.score(:UW2, w2)
|
91
|
+
score += @model.score(:UW3, w3)
|
92
|
+
score += @model.score(:UW4, w4)
|
93
|
+
score += @model.score(:UW5, w5)
|
94
|
+
score += @model.score(:UW6, w6)
|
95
|
+
score += @model.score(:BW1, w2 + w3)
|
96
|
+
score += @model.score(:BW2, w3 + w4)
|
97
|
+
score += @model.score(:BW3, w4 + w5)
|
98
|
+
score += @model.score(:TW1, w1 + w2 + w3)
|
99
|
+
score += @model.score(:TW2, w2 + w3 + w4)
|
100
|
+
score += @model.score(:TW3, w3 + w4 + w5)
|
101
|
+
score += @model.score(:TW4, w4 + w5 + w6)
|
102
|
+
score += @model.score(:UC1, c1)
|
103
|
+
score += @model.score(:UC2, c2)
|
104
|
+
score += @model.score(:UC3, c3)
|
105
|
+
score += @model.score(:UC4, c4)
|
106
|
+
score += @model.score(:UC5, c5)
|
107
|
+
score += @model.score(:UC6, c6)
|
108
|
+
score += @model.score(:BC1, c2 + c3)
|
109
|
+
score += @model.score(:BC2, c3 + c4)
|
110
|
+
score += @model.score(:BC3, c4 + c5)
|
111
|
+
score += @model.score(:TC1, c1 + c2 + c3)
|
112
|
+
score += @model.score(:TC2, c2 + c3 + c4)
|
113
|
+
score += @model.score(:TC3, c3 + c4 + c5)
|
114
|
+
score += @model.score(:TC4, c4 + c5 + c6)
|
115
|
+
score += @model.score(:UQ1, p1 + c1)
|
116
|
+
score += @model.score(:UQ2, p2 + c2)
|
117
|
+
score += @model.score(:UQ3, p3 + c3)
|
118
|
+
score += @model.score(:BQ1, p2 + c2 + c3)
|
119
|
+
score += @model.score(:BQ2, p2 + c3 + c4)
|
120
|
+
score += @model.score(:BQ3, p3 + c2 + c3)
|
121
|
+
score += @model.score(:BQ4, p3 + c3 + c4)
|
122
|
+
score += @model.score(:TQ1, p2 + c1 + c2 + c3)
|
123
|
+
score += @model.score(:TQ2, p2 + c2 + c3 + c4)
|
124
|
+
score += @model.score(:TQ3, p3 + c1 + c2 + c3)
|
125
|
+
score += @model.score(:TQ4, p3 + c2 + c3 + c4)
|
95
126
|
score
|
96
127
|
end
|
97
128
|
end
|
@@ -2,51 +2,53 @@
|
|
2
2
|
|
3
3
|
class SegmentationModel
|
4
4
|
def initialize
|
5
|
-
@
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
5
|
+
@models = {
|
6
|
+
:BC1 => {"HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378},
|
7
|
+
:BC2 => {"AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920},
|
8
|
+
:BC3 => {"HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266},
|
9
|
+
:BP1 => {"BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352},
|
10
|
+
:BP2 => {"BO" => 60,"OO" => -1762},
|
11
|
+
:BQ1 => {"BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965},
|
12
|
+
:BQ2 => {"BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146},
|
13
|
+
:BQ3 => {"BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699},
|
14
|
+
:BQ4 => {"BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973},
|
15
|
+
:BW1 => {",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682},
|
16
|
+
:BW2 => {".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669},
|
17
|
+
:BW3 => {"あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"市" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990},
|
18
|
+
:TC1 => {"AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832},
|
19
|
+
:TC2 => {"HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649},
|
20
|
+
:TC3 => {"AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393},
|
21
|
+
:TC4 => {"HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841},
|
22
|
+
:TQ1 => {"BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68},
|
23
|
+
:TQ2 => {"BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591},
|
24
|
+
:TQ3 => {"BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685},
|
25
|
+
:TQ4 => {"BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156},
|
26
|
+
:TW1 => {"につい" => -4681,"東京都" => 2026},
|
27
|
+
:TW2 => {"ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216},
|
28
|
+
:TW3 => {"いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287},
|
29
|
+
:TW4 => {"いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865},
|
30
|
+
:UC1 => {"A" => 484,"K" => 93,"M" => 645,"O" => -505},
|
31
|
+
:UC2 => {"A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646},
|
32
|
+
:UC3 => {"A" => -1370,"I" => 2311},
|
33
|
+
:UC4 => {"A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646},
|
34
|
+
:UC5 => {"H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831},
|
35
|
+
:UC6 => {"H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387},
|
36
|
+
:UP1 => {"O" => -214},
|
37
|
+
:UP2 => {"B" => 69,"O" => 935},
|
38
|
+
:UP3 => {"B" => 189},
|
39
|
+
:UQ1 => {"BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422},
|
40
|
+
:UQ2 => {"BH" => 216,"BI" => 113,"OK" => 1759},
|
41
|
+
:UQ3 => {"BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212},
|
42
|
+
:UW1 => {"," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"き" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" => -292,"り" => 208,"れ" => 169,"を" => -446,"ん" => -137,"・" => -135,"主" => -402,"京" => -268,"区" => -912,"午" => 871,"国" => -460,"大" => 561,"委" => 729,"市" => -411,"日" => -141,"理" => 361,"生" => -408,"県" => -386,"都" => -718,"「" => -463,"・" => -135},
|
43
|
+
:UW2 => {"," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"だ" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"べ" => 1261,"ま" => 600,"も" => -1263,"や" => -402,"よ" => 1639,"り" => -579,"る" => -694,"れ" => 571,"を" => -2516,"ん" => 2095,"ア" => -587,"カ" => 306,"キ" => 568,"ッ" => 831,"三" => -758,"不" => -2150,"世" => -302,"中" => -968,"主" => -861,"事" => 492,"人" => -123,"会" => 978,"保" => 362,"入" => 548,"初" => -3025,"副" => -1566,"北" => -3414,"区" => -422,"大" => -1769,"天" => -865,"太" => -483,"子" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"強" => 1067,"手" => -1519,"揺" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"明" => -1462,"最" => -630,"朝" => -1843,"本" => -1650,"東" => -931,"果" => -665,"次" => -2378,"民" => -180,"気" => -1740,"理" => 752,"発" => 529,"目" => -1584,"相" => -242,"県" => -1165,"立" => -763,"第" => 810,"米" => 509,"自" => -1353,"行" => 838,"西" => -744,"見" => -3874,"調" => 1010,"議" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"「" => -645,"」" => 3145,"ッ" => 831,"ア" => -587,"カ" => 306,"キ" => 568},
|
44
|
+
:UW3 => {"," => 4889,"1" => -800,"−" => -1723,"、" => 4889,"々" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"ご" => -3116,"さ" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"ち" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"ど" => -899,"な" => -2788,"に" => 2745,"の" => 4056,"は" => 4555,"ひ" => -2171,"ふ" => -1798,"へ" => 1199,"ほ" => -5516,"ま" => -4384,"み" => -120,"め" => 1205,"も" => 2323,"や" => -788,"よ" => -202,"ら" => 727,"り" => 649,"る" => 5905,"れ" => 2773,"わ" => -1207,"を" => 6620,"ん" => -518,"ア" => 551,"グ" => 1319,"ス" => 874,"ッ" => -1350,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278,"・" => -3794,"一" => -1619,"下" => -1759,"世" => -2087,"両" => 3815,"中" => 653,"主" => -758,"予" => -1193,"二" => 974,"人" => 2742,"今" => 792,"他" => 1889,"以" => -1368,"低" => 811,"何" => 4265,"作" => -361,"保" => -2439,"元" => 4858,"党" => 3593,"全" => 1574,"公" => -3030,"六" => 755,"共" => -1880,"円" => 5807,"再" => 3095,"分" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"北" => -1038,"区" => 4646,"千" => -2309,"午" => -783,"協" => -1006,"口" => 483,"右" => 1233,"各" => 3588,"合" => -241,"同" => 3906,"和" => -837,"員" => 4513,"国" => 642,"型" => 1389,"場" => 1219,"外" => -241,"妻" => 2016,"学" => -1356,"安" => -423,"実" => -1008,"家" => 1078,"小" => -513,"少" => -3102,"州" => 1155,"市" => 3197,"平" => -1804,"年" => 2416,"広" => -1030,"府" => 1605,"度" => 1452,"建" => -2352,"当" => -3885,"得" => 1905,"思" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"数" => 3222,"文" => -1489,"新" => 1764,"日" => 2099,"旧" => 5792,"昨" => -661,"時" => -1248,"曜" => -951,"最" => -937,"月" => 4125,"期" => 360,"李" => 3094,"村" => 364,"東" => -805,"核" => 5156,"森" => 2438,"業" => 484,"氏" => 2613,"民" => -1694,"決" => -1073,"法" => 1868,"海" => -495,"無" => 979,"物" => 461,"特" => -3850,"生" => -273,"用" => 914,"町" => 1215,"的" => 7313,"直" => -1835,"省" => 792,"県" => 6293,"知" => -1528,"私" => 4231,"税" => 401,"立" => -960,"第" => 1201,"米" => 7767,"系" => 3066,"約" => 3663,"級" => 1384,"統" => -4229,"総" => 1163,"線" => 1255,"者" => 6457,"能" => 725,"自" => -2869,"英" => 785,"見" => 1044,"調" => -562,"財" => -733,"費" => 1777,"車" => 1835,"軍" => 1375,"込" => -1504,"通" => -1136,"選" => -681,"郎" => 1026,"郡" => 4404,"部" => 1200,"金" => 2163,"長" => 421,"開" => -1432,"間" => 1302,"関" => -1282,"雨" => 2009,"電" => -1045,"非" => 2066,"駅" => 1620,"1" => -800,"」" => 2670,"・" => -3794,"ッ" => -1350,"ア" => 551,"グ" => 1319,"ス" => 874,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278},
|
45
|
+
:UW4 => {"," => 3930,"." => 3508,"―" => -4841,"、" => 3930,"。" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"な" => 5433,"に" => 6499,"ぬ" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"ば" => 1940,"ひ" => 4249,"び" => -4134,"ふ" => 1345,"へ" => 6665,"べ" => -744,"ほ" => 1464,"ま" => 1051,"み" => -2082,"む" => -882,"め" => -5046,"も" => 4169,"ゃ" => -2666,"や" => 2795,"ょ" => -1544,"よ" => 3351,"ら" => -2922,"り" => -9726,"る" => -14896,"れ" => -2613,"ろ" => -4570,"わ" => -1783,"を" => 13150,"ん" => -2352,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ッ" => -724,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637,"・" => -4371,"ー" => -11870,"一" => -2069,"中" => 2210,"予" => 782,"事" => -190,"井" => -1768,"人" => 1036,"以" => 544,"会" => 950,"体" => -1286,"作" => 530,"側" => 4292,"先" => 601,"党" => -2006,"共" => -1212,"内" => 584,"円" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"区" => 4517,"協" => 1013,"参" => 1555,"合" => -1834,"和" => -681,"員" => -910,"器" => -851,"回" => 1500,"国" => -619,"園" => -1200,"地" => 866,"場" => -1410,"塁" => -2094,"士" => -1413,"多" => 1067,"大" => 571,"子" => -4802,"学" => -1397,"定" => -1057,"寺" => -809,"小" => 1910,"屋" => -1328,"山" => -1500,"島" => -2056,"川" => -2667,"市" => 2771,"年" => 374,"庁" => -4556,"後" => 456,"性" => 553,"感" => 916,"所" => -1566,"支" => 856,"改" => 787,"政" => 2182,"教" => 704,"文" => 522,"方" => -856,"日" => 1798,"時" => 1829,"最" => 845,"月" => -9066,"木" => -485,"来" => -442,"校" => -360,"業" => -1043,"氏" => 5388,"民" => -2716,"気" => -910,"沢" => -939,"済" => -543,"物" => -735,"率" => 672,"球" => -1267,"生" => -1286,"産" => -1101,"田" => -2900,"町" => 1826,"的" => 2586,"目" => 922,"省" => -3485,"県" => 2997,"空" => -867,"立" => -2112,"第" => 788,"米" => 2937,"系" => 786,"約" => 2171,"経" => 1146,"統" => -1169,"総" => 940,"線" => -994,"署" => 749,"者" => 2145,"能" => -730,"般" => -852,"行" => -792,"規" => 792,"警" => -1184,"議" => -244,"谷" => -1000,"賞" => 730,"車" => -1481,"軍" => 1158,"輪" => -1433,"込" => -3370,"近" => 929,"道" => -1291,"選" => 2596,"郎" => -4866,"都" => 1192,"野" => -1100,"銀" => -2213,"長" => 357,"間" => -2344,"院" => -2297,"際" => -2604,"電" => -878,"領" => -1659,"題" => -792,"館" => -1984,"首" => 1749,"高" => 2120,"「" => 1895,"」" => 3798,"・" => -4371,"ッ" => -724,"ー" => -11870,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637},
|
46
|
+
:UW5 => {"," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"さ" => -1537,"し" => -1371,"す" => -852,"だ" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"ど" => 1682,"な" => -787,"に" => -1224,"の" => -635,"は" => -578,"べ" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"イ" => 241,"ル" => 451,"ン" => -343,"中" => -871,"京" => 722,"会" => -1153,"党" => -654,"務" => 3519,"区" => -901,"告" => 848,"員" => 2104,"大" => -1296,"学" => -548,"定" => 1785,"嵐" => -1304,"市" => -2991,"席" => 921,"年" => 1763,"思" => 872,"所" => -814,"挙" => 1618,"新" => -1682,"日" => 218,"月" => -4353,"査" => 932,"格" => 1356,"機" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"相" => 1319,"省" => -1052,"県" => -4003,"研" => -997,"社" => -278,"空" => -813,"統" => 1955,"者" => -2233,"表" => 663,"語" => -1073,"議" => 1219,"選" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"1" => -514,"E2" => -32768,"「" => 363,"イ" => 241,"ル" => 451,"ン" => -343},
|
47
|
+
:UW6 => {"," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"も" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"一" => -277,"中" => 201,"件" => -800,"会" => 624,"前" => 302,"区" => 1792,"員" => -1212,"委" => 798,"学" => -960,"市" => 887,"広" => -695,"後" => 535,"業" => -697,"相" => 753,"社" => -507,"福" => 974,"空" => -822,"者" => 1811,"連" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496},
|
48
|
+
}
|
47
49
|
end
|
48
50
|
|
49
51
|
def score(category, pattern)
|
50
|
-
|
52
|
+
@models[category][pattern] || 0
|
51
53
|
end
|
52
54
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -7,7 +7,6 @@ require 'tiny_segmenter'
|
|
7
7
|
#
|
8
8
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
9
9
|
RSpec.configure do |config|
|
10
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
11
10
|
config.run_all_when_everything_filtered = true
|
12
11
|
config.filter_run :focus
|
13
12
|
end
|
data/spec/tiny_segmenter_spec.rb
CHANGED
@@ -2,43 +2,47 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe TinySegmenter do
|
5
|
-
subject{
|
5
|
+
subject{ described_class.new }
|
6
6
|
|
7
7
|
describe "#segment" do
|
8
8
|
it "tokenizes Japanese text fairly accurately" do
|
9
|
-
subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").
|
10
|
-
["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
|
9
|
+
expect(subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。")).to \
|
10
|
+
eq(["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"])
|
11
11
|
end
|
12
12
|
|
13
13
|
it "removes any whitespace-only or empty tokens" do
|
14
|
-
subject.segment("書かれた 極めて コンパクト").
|
14
|
+
expect(subject.segment("書かれた 極めて コンパクト")).not_to include("", " ", nil)
|
15
15
|
end
|
16
16
|
|
17
17
|
it "removes full-width space (U+3000) tokens" do
|
18
18
|
sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
|
19
19
|
full_width_space = " "
|
20
|
-
sentence.
|
21
|
-
subject.segment(sentence).
|
20
|
+
expect(sentence).to include(full_width_space)
|
21
|
+
expect(subject.segment(sentence)).not_to include (full_width_space)
|
22
22
|
end
|
23
23
|
|
24
24
|
it "tokenizes interspersed non-Japanese words correctly" do
|
25
|
-
subject.segment("TinySegmenterはRubyだけで").
|
25
|
+
expect(subject.segment("TinySegmenterはRubyだけで")).to \
|
26
|
+
eq(["TinySegmenter", "は", "Ruby", "だけ", "で"])
|
26
27
|
end
|
27
28
|
|
28
29
|
context "with ignore_punctuation option not set" do
|
29
30
|
it "includes punctuation-only tokens" do
|
30
|
-
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").
|
31
|
+
expect(subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...")).to \
|
32
|
+
include("。", "!", "?", "、", "「", "」", "...")
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
36
|
context "with ignore_punctuation option set" do
|
35
37
|
it "removes all punctuation-only tokens" do
|
36
|
-
subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).
|
38
|
+
expect(subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true)).not_to \
|
39
|
+
include("。", "!", "?", "、", "「", "」", "...")
|
37
40
|
end
|
38
41
|
end
|
39
42
|
end
|
40
43
|
|
41
44
|
it "has a version" do
|
42
|
-
|
45
|
+
expect(described_class::VERSION).to be_kind_of(String)
|
46
|
+
expect(described_class::VERSION).not_to be_empty
|
43
47
|
end
|
44
48
|
end
|
data/tiny_segmenter.gemspec
CHANGED
@@ -5,9 +5,9 @@ require 'tiny_segmenter/version'
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'tiny_segmenter'
|
7
7
|
s.version = TinySegmenter::VERSION
|
8
|
-
s.
|
8
|
+
s.licenses = ['BSD']
|
9
9
|
s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
10
|
-
s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
|
10
|
+
s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text. Uses a Naive Bayes model that has been trained using the RWCP corpus and optimized using L1-norm regularization. The resultant model is quite compact, yet has a 95% accuracy rate."
|
11
11
|
s.authors = ["Peter Graham"]
|
12
12
|
s.email = ["pete@gigadrill.com"]
|
13
13
|
s.files = `git ls-files`.split("\n")
|
@@ -15,6 +15,6 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.require_paths = ["lib"]
|
16
16
|
s.homepage = 'http://github.com/6/tiny_segmenter'
|
17
17
|
|
18
|
-
s.add_development_dependency "rake"
|
19
|
-
s.add_development_dependency "rspec"
|
18
|
+
s.add_development_dependency "rake", "~> 10.4"
|
19
|
+
s.add_development_dependency "rspec", "~> 3.3"
|
20
20
|
end
|
metadata
CHANGED
@@ -1,53 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiny_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Graham
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '10.4'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '10.4'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '3.3'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
41
|
-
description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|
40
|
+
version: '3.3'
|
41
|
+
description: Ruby port of TinySegmenter.js for tokenizing Japanese text. Uses a Naive
|
42
|
+
Bayes model that has been trained using the RWCP corpus and optimized using L1-norm
|
43
|
+
regularization. The resultant model is quite compact, yet has a 95% accuracy rate.
|
42
44
|
email:
|
43
45
|
- pete@gigadrill.com
|
44
46
|
executables: []
|
45
47
|
extensions: []
|
46
48
|
extra_rdoc_files: []
|
47
49
|
files:
|
48
|
-
- .gitignore
|
49
|
-
- .rspec
|
50
|
-
- .travis.yml
|
50
|
+
- ".gitignore"
|
51
|
+
- ".rspec"
|
52
|
+
- ".travis.yml"
|
51
53
|
- Gemfile
|
52
54
|
- README.md
|
53
55
|
- Rakefile
|
@@ -58,7 +60,8 @@ files:
|
|
58
60
|
- spec/tiny_segmenter_spec.rb
|
59
61
|
- tiny_segmenter.gemspec
|
60
62
|
homepage: http://github.com/6/tiny_segmenter
|
61
|
-
licenses:
|
63
|
+
licenses:
|
64
|
+
- BSD
|
62
65
|
metadata: {}
|
63
66
|
post_install_message:
|
64
67
|
rdoc_options: []
|
@@ -66,17 +69,17 @@ require_paths:
|
|
66
69
|
- lib
|
67
70
|
required_ruby_version: !ruby/object:Gem::Requirement
|
68
71
|
requirements:
|
69
|
-
- -
|
72
|
+
- - ">="
|
70
73
|
- !ruby/object:Gem::Version
|
71
74
|
version: '0'
|
72
75
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
76
|
requirements:
|
74
|
-
- -
|
77
|
+
- - ">="
|
75
78
|
- !ruby/object:Gem::Version
|
76
79
|
version: '0'
|
77
80
|
requirements: []
|
78
81
|
rubyforge_project:
|
79
|
-
rubygems_version: 2.
|
82
|
+
rubygems_version: 2.4.5.1
|
80
83
|
signing_key:
|
81
84
|
specification_version: 4
|
82
85
|
summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
|