tiny_segmenter 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ed996e5195f9e29609e110d175c3d002875e5496
4
- data.tar.gz: 00405bd85ee522fc3d0b7271cf46fe06bca298a8
3
+ metadata.gz: 6b4bdaf300656970e564a69ccf28cf682cda90d4
4
+ data.tar.gz: d2904cf58175c7a869dfbc10406961d7aa550213
5
5
  SHA512:
6
- metadata.gz: e5e0455da693e7eba988f3a921d05c13db13756d9658b0481643963d35367cc207ea02a61f45e02be288ba0a4315b202fc95e6c3bbf88b2fb97b74df6afcd8bd
7
- data.tar.gz: b3d581557d7097c012bb565ee736265892b686f0b7b4fa286c07409e785da2664cb46f4e7971417d530a277ed8cb302df632c91fe7b85d106ef0d2b20b01a24c
6
+ metadata.gz: a5a7457261ac16d4f9ddb050e04fb5dd742dd02bb5a576ae4d47d2d8f322d7a680286617014f3245f84867414f12dcea5cb2bd2d6270909bc999b12e3091b0da
7
+ data.tar.gz: 412f0f901042b0d566d7642d01cdf8346f38c2bf3f25dc246f485afe960014a80f15a3f5ae21fd7d24f4210fb055b47f64fa7d5582898ee55ca5725baa23b703
@@ -3,5 +3,6 @@ rvm:
3
3
  - 1.9.2
4
4
  - 1.9.3
5
5
  - 2.0.0
6
+ - 2.1.7
7
+ - 2.2.3
6
8
  - jruby-19mode # JRuby in 1.9 mode
7
- - jruby-20mode
data/README.md CHANGED
@@ -10,8 +10,10 @@ Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/)
10
10
 
11
11
  ```ruby
12
12
  ts = TinySegmenter.new
13
- p ts.segment("今晩は!良い天気ですね")
14
- # => ["今晩", "は", "!", "良い", "天気", "です", "ね"]
13
+ ts.segment("今晩は!良い天気ですね。")
14
+ # => ["今晩", "は", "!", "良い", "天気", "です", "ね", "。"]
15
+ ts.segment("今晩は!良い天気ですね。", ignore_punctuation: true)
16
+ # => ["今晩", "は", "良い", "天気", "です", "ね"]
15
17
  ```
16
18
 
17
19
  Input text should be UTF-8 encoded.
@@ -43,14 +43,20 @@ class TinySegmenter
43
43
  p1, p2, p3 = %w[U U U]
44
44
  (4..segments.size-4).to_a.each do |i|
45
45
  score = @BIAS
46
- words = []
47
- chars = []
48
- (-3..2).to_a.each do |idx|
49
- words << segments[i + idx]
50
- chars << ctypes[i + idx]
51
- end
46
+ w1 = segments[i - 3]
47
+ w2 = segments[i - 2]
48
+ w3 = segments[i - 1]
49
+ w4 = segments[i]
50
+ w5 = segments[i + 1]
51
+ w6 = segments[i + 2]
52
+ c1 = ctypes[i - 3]
53
+ c2 = ctypes[i - 2]
54
+ c3 = ctypes[i - 1]
55
+ c4 = ctypes[i]
56
+ c5 = ctypes[i + 1]
57
+ c6 = ctypes[i + 2]
52
58
 
53
- score += sum_scores(p1, p2, p3, *words, *chars)
59
+ score += sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)
54
60
  p_new = "O"
55
61
  if score > 0
56
62
  result << word
@@ -75,23 +81,48 @@ class TinySegmenter
75
81
 
76
82
  def sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)
77
83
  score = 0
78
- [
79
- [:UP1, p1], [:UP2, p2], [:UP3, p3],
80
- [:BP1, p1, p2], [:BP2, p2, p3],
81
- [:UW1, w1], [:UW2, w2], [:UW3, w3], [:UW4, w4], [:UW5, w5], [:UW6, w6],
82
- [:BW1, w2, w3], [:BW2, w3, w4], [:BW3, w4, w5],
83
- [:TW1, w1, w2, w3], [:TW2, w2, w3, w4], [:TW3, w3, w4, w5], [:TW4, w4, w5, w6],
84
- [:UC1, c1], [:UC2, c2], [:UC3, c3], [:UC4, c4], [:UC5, c5], [:UC6, c6],
85
- [:BC1, c2, c3], [:BC2, c3, c4], [:BC3, c4, c5],
86
- [:TC1, c1, c2, c3], [:TC2, c2, c3, c4], [:TC3, c3, c4, c5], [:TC4, c4, c5, c6],
87
- [:UQ1, p1, c1], [:UQ2, p2, c2], [:UQ3, p3, c3],
88
- [:BQ1, p2, c2, c3], [:BQ2, p2, c3, c4], [:BQ3, p3, c2, c3], [:BQ4, p3, c3, c4],
89
- [:TQ1, p2, c1, c2, c3], [:TQ2, p2, c2, c3, c4], [:TQ3, p3, c1, c2, c3], [:TQ4, p3, c2, c3, c4],
90
- ].each do |category_and_pattern|
91
- category = category_and_pattern[0]
92
- pattern = category_and_pattern[1..-1].join("")
93
- score += @model.score(category, pattern)
94
- end
84
+ score += @model.score(:UP1, p1)
85
+ score += @model.score(:UP2, p2)
86
+ score += @model.score(:UP3, p3)
87
+ score += @model.score(:BP1, p1 + p2)
88
+ score += @model.score(:BP2, p2 + p3)
89
+ score += @model.score(:UW1, w1)
90
+ score += @model.score(:UW2, w2)
91
+ score += @model.score(:UW3, w3)
92
+ score += @model.score(:UW4, w4)
93
+ score += @model.score(:UW5, w5)
94
+ score += @model.score(:UW6, w6)
95
+ score += @model.score(:BW1, w2 + w3)
96
+ score += @model.score(:BW2, w3 + w4)
97
+ score += @model.score(:BW3, w4 + w5)
98
+ score += @model.score(:TW1, w1 + w2 + w3)
99
+ score += @model.score(:TW2, w2 + w3 + w4)
100
+ score += @model.score(:TW3, w3 + w4 + w5)
101
+ score += @model.score(:TW4, w4 + w5 + w6)
102
+ score += @model.score(:UC1, c1)
103
+ score += @model.score(:UC2, c2)
104
+ score += @model.score(:UC3, c3)
105
+ score += @model.score(:UC4, c4)
106
+ score += @model.score(:UC5, c5)
107
+ score += @model.score(:UC6, c6)
108
+ score += @model.score(:BC1, c2 + c3)
109
+ score += @model.score(:BC2, c3 + c4)
110
+ score += @model.score(:BC3, c4 + c5)
111
+ score += @model.score(:TC1, c1 + c2 + c3)
112
+ score += @model.score(:TC2, c2 + c3 + c4)
113
+ score += @model.score(:TC3, c3 + c4 + c5)
114
+ score += @model.score(:TC4, c4 + c5 + c6)
115
+ score += @model.score(:UQ1, p1 + c1)
116
+ score += @model.score(:UQ2, p2 + c2)
117
+ score += @model.score(:UQ3, p3 + c3)
118
+ score += @model.score(:BQ1, p2 + c2 + c3)
119
+ score += @model.score(:BQ2, p2 + c3 + c4)
120
+ score += @model.score(:BQ3, p3 + c2 + c3)
121
+ score += @model.score(:BQ4, p3 + c3 + c4)
122
+ score += @model.score(:TQ1, p2 + c1 + c2 + c3)
123
+ score += @model.score(:TQ2, p2 + c2 + c3 + c4)
124
+ score += @model.score(:TQ3, p3 + c1 + c2 + c3)
125
+ score += @model.score(:TQ4, p3 + c2 + c3 + c4)
95
126
  score
96
127
  end
97
128
  end
@@ -2,51 +2,53 @@
2
2
 
3
3
  class SegmentationModel
4
4
  def initialize
5
- @BC1 = {"HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378}
6
- @BC2 = {"AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920}
7
- @BC3 = {"HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266}
8
- @BP1 = {"BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352}
9
- @BP2 = {"BO" => 60,"OO" => -1762}
10
- @BQ1 = {"BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965}
11
- @BQ2 = {"BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146}
12
- @BQ3 = {"BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699}
13
- @BQ4 = {"BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973}
14
- @BW1 = {",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682}
15
- @BW2 = {".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669}
16
- @BW3 = {"あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990}
17
- @TC1 = {"AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832}
18
- @TC2 = {"HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649}
19
- @TC3 = {"AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393}
20
- @TC4 = {"HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841}
21
- @TQ1 = {"BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68}
22
- @TQ2 = {"BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591}
23
- @TQ3 = {"BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685}
24
- @TQ4 = {"BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156}
25
- @TW1 = {"につい" => -4681,"東京都" => 2026}
26
- @TW2 = {"ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216}
27
- @TW3 = {"いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287}
28
- @TW4 = {"いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865}
29
- @UC1 = {"A" => 484,"K" => 93,"M" => 645,"O" => -505}
30
- @UC2 = {"A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646}
31
- @UC3 = {"A" => -1370,"I" => 2311}
32
- @UC4 = {"A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646}
33
- @UC5 = {"H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831}
34
- @UC6 = {"H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387}
35
- @UP1 = {"O" => -214}
36
- @UP2 = {"B" => 69,"O" => 935}
37
- @UP3 = {"B" => 189}
38
- @UQ1 = {"BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422}
39
- @UQ2 = {"BH" => 216,"BI" => 113,"OK" => 1759}
40
- @UQ3 = {"BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212}
41
- @UW1 = {"," => 156,"、" => 156,"「" => -463,"" => -941,"" => -127,"" => -553,"" => 121,"" => 505,"" => -201,"" => -547,"" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" => -292,"り" => 208,"れ" => 169,"を" => -446,"ん" => -137,"・" => -135,"主" => -402,"京" => -268,"区" => -912,"午" => 871,"国" => -460,"大" => 561,"委" => 729,"市" => -411,"日" => -141,"理" => 361,"生" => -408,"県" => -386,"都" => -718,"「" => -463,"・" => -135}
42
- @UW2 = {"," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"だ" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"べ" => 1261,"ま" => 600,"も" => -1263,"や" => -402,"よ" => 1639,"" => -579,"" => -694,"れ" => 571,"を" => -2516,"ん" => 2095,"ア" => -587,"" => 306,"キ" => 568,"ッ" => 831,"三" => -758,"不" => -2150,"世" => -302,"中" => -968,"主" => -861,"" => 492,"人" => -123,"" => 978,"保" => 362,"入" => 548,"初" => -3025,"" => -1566,"" => -3414,"区" => -422,"大" => -1769,"" => -865,"太" => -483,"子" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"強" => 1067,"手" => -1519,"揺" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"明" => -1462,"最" => -630,"朝" => -1843,"本" => -1650,"東" => -931,"果" => -665,"次" => -2378,"民" => -180,"気" => -1740,"理" => 752,"" => 529,"目" => -1584,"相" => -242,"県" => -1165,"" => -763,"第" => 810,"米" => 509,"自" => -1353,"行" => 838,"西" => -744,"見" => -3874,"調" => 1010,"議" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"「" => -645,"" => 3145,"ッ" => 831,"ア" => -587,"カ" => 306,"キ" => 568}
43
- @UW3 = {"," => 4889,"1" => -800,"" => -1723,"" => 4889,"" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"" => -3116,"" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"ど" => -899,"な" => -2788,"に" => 2745,"の" => 4056,"は" => 4555,"ひ" => -2171,"" => -1798,"へ" => 1199,"ほ" => -5516,"ま" => -4384,"" => -120,"め" => 1205,"も" => 2323,"や" => -788,"よ" => -202,"ら" => 727,"り" => 649,"る" => 5905,"れ" => 2773,"" => -1207,"を" => 6620,"ん" => -518,"ア" => 551,"" => 1319,"" => 874,"ッ" => -1350,"" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278,"・" => -3794,"" => -1619,"下" => -1759,"世" => -2087,"両" => 3815,"中" => 653,"主" => -758,"" => -1193,"二" => 974,"人" => 2742,"今" => 792,"他" => 1889,"以" => -1368,"" => 811,"何" => 4265,"作" => -361,"保" => -2439,"" => 4858,"" => 3593,"全" => 1574,"公" => -3030,"六" => 755,"共" => -1880,"円" => 5807,"再" => 3095,"分" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"北" => -1038,"区" => 4646,"" => -2309,"" => -783,"" => -1006,"口" => 483,"" => 1233,"各" => 3588,"合" => -241,"同" => 3906,"和" => -837,"員" => 4513,"国" => 642,"型" => 1389,"場" => 1219,"外" => -241,"妻" => 2016,"学" => -1356,"安" => -423,"実" => -1008,"家" => 1078,"小" => -513,"少" => -3102,"州" => 1155,"市" => 3197,"平" => -1804,"年" => 2416,"広" => -1030,"" => 1605,"" => 1452,"建" => -2352,"" => -3885,"得" => 1905,"思" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"数" => 3222,"文" => -1489,"新" => 1764,"日" => 2099,"" => 5792,"昨" => -661,"" => -1248,"" => -951,"" => -937,"月" => 4125,"期" => 360,"李" => 3094,"村" => 364,"東" => -805,"" => 5156,"" => 2438,"業" => 484,"氏" => 2613,"民" => -1694,"" => -1073,"" => 1868,"" => -495,"" => 979,"物" => 461,"特" => -3850,"" => -273,"用" => 914,"町" => 1215,"的" => 7313,"直" => -1835,"省" => 792,"県" => 6293,"知" => -1528,"私" => 4231,"税" => 401,"立" => -960,"第" => 1201,"米" => 7767,"" => 3066,"約" => 3663,"級" => 1384,"統" => -4229,"" => 1163,"" => 1255,"者" => 6457,"能" => 725,"自" => -2869,"英" => 785,"見" => 1044,"調" => -562,"" => -733,"費" => 1777,"車" => 1835,"軍" => 1375,"込" => -1504,"通" => -1136,"選" => -681,"郎" => 1026,"郡" => 4404,"部" => 1200,"金" => 2163,"長" => 421,"開" => -1432,"間" => 1302,"関" => -1282,"" => 2009,"電" => -1045,"非" => 2066,"駅" => 1620,"1" => -800,"」" => 2670,"・" => -3794,"ッ" => -1350,"ア" => 551,"グ" => 1319,"" => 874,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278}
44
- @UW4 = {"," => 3930,"." => 3508,"" => -4841,"、" => 3930,"" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"" => 5433,"" => 6499,"" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"" => 1940,"" => 4249,"" => -4134,"" => 1345,"" => 6665,"" => -744,"" => 1464,"" => 1051,"" => -2082,"" => -882,"" => -5046,"" => 4169,"" => -2666,"" => 2795,"" => -1544,"" => 3351,"" => -2922,"" => -9726,"" => -14896,"" => -2613,"" => -4570,"" => -1783,"" => 13150,"" => -2352,"" => 2145,"" => 1789,"" => 1287,"" => -724,"" => -403,"" => -1635,"" => -881,"" => -541,"" => -856,"" => -3637,"" => -4371,"" => -11870,"" => -2069,"" => 2210,"" => 782,"" => -190,"" => -1768,"" => 1036,"" => 544,"" => 950,"" => -1286,"" => 530,"" => 4292,"" => 601,"" => -2006,"" => -1212,"" => 584,"" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"" => 4517,"" => 1013,"" => 1555,"" => -1834,"" => -681,"" => -910,"" => -851,"" => 1500,"" => -619,"" => -1200,"" => 866,"" => -1410,"" => -2094,"" => -1413,"" => 1067,"" => 571,"" => -4802,"学" => -1397,"" => -1057,"" => -809,"" => 1910,"" => -1328,"" => -1500,"" => -2056,"" => -2667,"" => 2771,"年" => 374,"" => -4556,"" => 456,"" => 553,"" => 916,"" => -1566,"" => 856,"" => 787,"政" => 2182,"教" => 704,"" => 522,"" => -856,"" => 1798,"" => 1829,"" => 845,"" => -9066,"" => -485,"" => -442,"" => -360,"" => -1043,"" => 5388,"" => -2716,"" => -910,"" => -939,"" => -543,"" => -735,"" => 672,"" => -1267,"" => -1286,"" => -1101,"" => -2900,"" => 1826,"" => 2586,"" => 922,"" => -3485,"" => 2997,"" => -867,"" => -2112,"" => 788,"" => 2937,"" => 786,"" => 2171,"" => 1146,"" => -1169,"" => 940,"" => -994,"" => 749,"" => 2145,"" => -730,"" => -852,"" => -792,"" => 792,"" => -1184,"" => -244,"" => -1000,"" => 730,"" => -1481,"" => 1158,"" => -1433,"" => -3370,"" => 929,"" => -1291,"" => 2596,"" => -4866,"" => 1192,"" => -1100,"" => -2213,"" => 357,"" => -2344,"" => -2297,"" => -2604,"" => -878,"" => -1659,"" => -792,"" => -1984,"" => 1749,"" => 2120,"" => 1895,"" => 3798,"" => -4371,"" => -724,"" => -11870,"" => 2145,"" => 1789,"" => 1287,"" => -403,"" => -1635,"" => -881,"" => -541,"" => -856,"ン" => -3637}
45
- @UW5 = {"," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"" => 465,"" => -299,"" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"さ" => -1537,"し" => -1371,"す" => -852,"だ" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"" => 1682,"" => -787,"" => -1224,"の" => -635,"は" => -578,"べ" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"" => 241,"ル" => 451,"ン" => -343,"中" => -871,"" => 722,"会" => -1153,"党" => -654,"務" => 3519,"区" => -901,"" => 848,"員" => 2104,"大" => -1296,"学" => -548,"定" => 1785,"" => -1304,"" => -2991,"" => 921,"年" => 1763,"" => 872,"所" => -814,"" => 1618,"" => -1682,"日" => 218,"月" => -4353,"" => 932,"" => 1356,"" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"" => 1319,"省" => -1052,"県" => -4003,"" => -997,"" => -278,"" => -813,"" => 1955,"者" => -2233,"" => 663,"" => -1073,"議" => 1219,"" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"" => -514,"E2" => -32768,"" => 363,"" => 241,"ル" => 451,"ン" => -343}
46
- @UW6 = {"," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"" => -200,"" => 1782,"す" => 383,"" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"" => -277,"" => 201,"" => -800,"" => 624,"" => 302,"区" => 1792,"員" => -1212,"" => 798,"学" => -960,"市" => 887,"" => -695,"" => 535,"" => -697,"相" => 753,"" => -507,"" => 974,"空" => -822,"者" => 1811,"" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496}
5
+ @models = {
6
+ :BC1 => {"HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378},
7
+ :BC2 => {"AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920},
8
+ :BC3 => {"HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266},
9
+ :BP1 => {"BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352},
10
+ :BP2 => {"BO" => 60,"OO" => -1762},
11
+ :BQ1 => {"BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965},
12
+ :BQ2 => {"BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146},
13
+ :BQ3 => {"BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699},
14
+ :BQ4 => {"BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973},
15
+ :BW1 => {",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682},
16
+ :BW2 => {".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669},
17
+ :BW3 => {"あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"市" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990},
18
+ :TC1 => {"AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832},
19
+ :TC2 => {"HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649},
20
+ :TC3 => {"AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393},
21
+ :TC4 => {"HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841},
22
+ :TQ1 => {"BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68},
23
+ :TQ2 => {"BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591},
24
+ :TQ3 => {"BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685},
25
+ :TQ4 => {"BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156},
26
+ :TW1 => {"につい" => -4681,"東京都" => 2026},
27
+ :TW2 => {"ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216},
28
+ :TW3 => {"いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287},
29
+ :TW4 => {"いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865},
30
+ :UC1 => {"A" => 484,"K" => 93,"M" => 645,"O" => -505},
31
+ :UC2 => {"A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646},
32
+ :UC3 => {"A" => -1370,"I" => 2311},
33
+ :UC4 => {"A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646},
34
+ :UC5 => {"H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831},
35
+ :UC6 => {"H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387},
36
+ :UP1 => {"O" => -214},
37
+ :UP2 => {"B" => 69,"O" => 935},
38
+ :UP3 => {"B" => 189},
39
+ :UQ1 => {"BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422},
40
+ :UQ2 => {"BH" => 216,"BI" => 113,"OK" => 1759},
41
+ :UQ3 => {"BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212},
42
+ :UW1 => {"," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"" => -292,"" => 208,"れ" => 169,"を" => -446,"ん" => -137,"" => -135,"主" => -402,"" => -268,"" => -912,"" => 871,"" => -460,"大" => 561,"" => 729,"市" => -411,"日" => -141,"理" => 361,"" => -408,"県" => -386,"" => -718,"「" => -463,"" => -135},
43
+ :UW2 => {"," => -829,"" => -829,"" => 892,"" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"" => 878,"" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"" => 1261,"ま" => 600,"" => -1263,"や" => -402,"よ" => 1639,"り" => -579,"る" => -694,"れ" => 571,"" => -2516,"ん" => 2095,"ア" => -587,"" => 306,"" => 568,"ッ" => 831,"" => -758,"" => -2150,"世" => -302,"中" => -968,"主" => -861,"" => 492,"人" => -123,"" => 978,"保" => 362,"" => 548,"" => -3025,"副" => -1566,"北" => -3414,"区" => -422,"" => -1769,"" => -865,"" => -483,"" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"" => 1067,"" => -1519,"" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"" => -1462,"" => -630,"" => -1843,"" => -1650,"東" => -931,"" => -665,"" => -2378,"民" => -180,"" => -1740,"" => 752,"" => 529,"" => -1584,"" => -242,"県" => -1165,"立" => -763,"第" => 810,"米" => 509,"" => -1353,"" => 838,"西" => -744,"見" => -3874,"調" => 1010,"" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"" => -645,"」" => 3145,"ッ" => 831,"ア" => -587,"" => 306,"" => 568},
44
+ :UW3 => {"," => 4889,"1" => -800,"" => -1723,"、" => 4889,"" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"ご" => -3116,"さ" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"ち" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"" => -899,"" => -2788,"" => 2745,"の" => 4056,"は" => 4555,"" => -2171,"" => -1798,"" => 1199,"" => -5516,"" => -4384,"" => -120,"" => 1205,"" => 2323,"" => -788,"" => -202,"" => 727,"" => 649,"" => 5905,"" => 2773,"" => -1207,"" => 6620,"" => -518,"" => 551,"" => 1319,"" => 874,"" => -1350,"" => 521,"" => 1109,"" => 1591,"" => 2201,"" => 278,"" => -3794,"" => -1619,"" => -1759,"" => -2087,"" => 3815,"" => 653,"" => -758,"" => -1193,"" => 974,"" => 2742,"" => 792,"" => 1889,"" => -1368,"" => 811,"" => 4265,"" => -361,"" => -2439,"" => 4858,"" => 3593,"" => 1574,"" => -3030,"" => 755,"" => -1880,"" => 5807,"" => 3095,"" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"" => -1038,"" => 4646,"" => -2309,"" => -783,"" => -1006,"" => 483,"" => 1233,"" => 3588,"" => -241,"" => 3906,"" => -837,"" => 4513,"" => 642,"" => 1389,"" => 1219,"" => -241,"" => 2016,"学" => -1356,"" => -423,"" => -1008,"" => 1078,"" => -513,"" => -3102,"" => 1155,"" => 3197,"" => -1804,"年" => 2416,"" => -1030,"" => 1605,"" => 1452,"" => -2352,"" => -3885,"" => 1905,"" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"" => 3222,"" => -1489,"" => 1764,"" => 2099,"" => 5792,"" => -661,"" => -1248,"" => -951,"" => -937,"" => 4125,"" => 360,"" => 3094,"" => 364,"" => -805,"" => 5156,"" => 2438,"" => 484,"" => 2613,"" => -1694,"" => -1073,"" => 1868,"" => -495,"" => 979,"" => 461,"" => -3850,"" => -273,"" => 914,"" => 1215,"" => 7313,"" => -1835,"" => 792,"" => 6293,"" => -1528,"" => 4231,"" => 401,"" => -960,"" => 1201,"" => 7767,"" => 3066,"" => 3663,"" => 1384,"" => -4229,"" => 1163,"" => 1255,"" => 6457,"" => 725,"" => -2869,"" => 785,"" => 1044,"調" => -562,"" => -733,"" => 1777,"" => 1835,"" => 1375,"" => -1504,"" => -1136,"" => -681,"" => 1026,"" => 4404,"" => 1200,"" => 2163,"" => 421,"" => -1432,"" => 1302,"" => -1282,"" => 2009,"" => -1045,"" => 2066,"" => 1620,"" => -800,"" => 2670,"" => -3794,"" => -1350,"" => 551,"グ" => 1319,"" => 874,"" => 521,"" => 1109,"" => 1591,"" => 2201,"ン" => 278},
45
+ :UW4 => {"," => 3930,"." => 3508,"" => -4841,"" => 3930,"" => 3508,"" => 4999,"" => 1895,"" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"" => 5433,"" => 6499,"" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"ば" => 1940,"ひ" => 4249,"び" => -4134,"ふ" => 1345,"へ" => 6665,"べ" => -744,"ほ" => 1464,"ま" => 1051,"み" => -2082,"む" => -882,"め" => -5046,"も" => 4169,"ゃ" => -2666,"や" => 2795,"ょ" => -1544,"よ" => 3351,"ら" => -2922,"り" => -9726,"る" => -14896,"れ" => -2613,"ろ" => -4570,"わ" => -1783,"を" => 13150,"ん" => -2352,"" => 2145,"コ" => 1789,"セ" => 1287,"ッ" => -724,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637,"・" => -4371,"ー" => -11870,"一" => -2069,"中" => 2210,"予" => 782,"事" => -190,"" => -1768,"人" => 1036,"以" => 544,"会" => 950,"体" => -1286,"作" => 530,"側" => 4292,"先" => 601,"党" => -2006,"共" => -1212,"内" => 584,"円" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"区" => 4517,"協" => 1013,"参" => 1555,"合" => -1834,"" => -681,"員" => -910,"器" => -851,"回" => 1500,"国" => -619,"園" => -1200,"地" => 866,"場" => -1410,"塁" => -2094,"士" => -1413,"多" => 1067,"大" => 571,"子" => -4802,"学" => -1397,"定" => -1057,"" => -809,"" => 1910,"屋" => -1328,"" => -1500,"島" => -2056,"川" => -2667,"市" => 2771,"年" => 374,"" => -4556,"後" => 456,"性" => 553,"感" => 916,"所" => -1566,"" => 856,"" => 787,"政" => 2182,"教" => 704,"文" => 522,"方" => -856,"日" => 1798,"時" => 1829,"最" => 845,"月" => -9066,"" => -485,"" => -442,"" => -360,"業" => -1043,"氏" => 5388,"民" => -2716,"気" => -910,"沢" => -939,"済" => -543,"物" => -735,"率" => 672,"球" => -1267,"生" => -1286,"産" => -1101,"田" => -2900,"町" => 1826,"的" => 2586,"" => 922,"省" => -3485,"県" => 2997,"空" => -867,"" => -2112,"" => 788,"米" => 2937,"系" => 786,"約" => 2171,"経" => 1146,"統" => -1169,"" => 940,"線" => -994,"" => 749,"者" => 2145,"能" => -730,"" => -852,"" => -792,"規" => 792,"警" => -1184,"議" => -244,"" => -1000,"賞" => 730,"車" => -1481,"軍" => 1158,"輪" => -1433,"込" => -3370,"近" => 929,"道" => -1291,"選" => 2596,"郎" => -4866,"都" => 1192,"野" => -1100,"銀" => -2213,"長" => 357,"間" => -2344,"院" => -2297,"際" => -2604,"電" => -878,"領" => -1659,"題" => -792,"館" => -1984,"" => 1749,"高" => 2120,"「" => 1895,"」" => 3798,"・" => -4371,"" => -724,"" => -11870,"" => 2145,"コ" => 1789,"セ" => 1287,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637},
46
+ :UW5 => {"," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"" => -1537,"" => -1371,"す" => -852,"" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"ど" => 1682,"な" => -787,"に" => -1224,"の" => -635,"は" => -578,"" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"イ" => 241,"ル" => 451,"ン" => -343,"" => -871,"" => 722,"" => -1153,"" => -654,"" => 3519,"区" => -901,"告" => 848,"員" => 2104,"" => -1296,"学" => -548,"定" => 1785,"嵐" => -1304,"市" => -2991,"" => 921,"年" => 1763,"思" => 872,"所" => -814,"" => 1618,"" => -1682,"日" => 218,"月" => -4353,"査" => 932,"格" => 1356,"機" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"相" => 1319,"" => -1052,"" => -4003,"研" => -997,"社" => -278,"空" => -813,"統" => 1955,"者" => -2233,"" => 663,"語" => -1073,"議" => 1219,"選" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"1" => -514,"E2" => -32768,"「" => 363,"イ" => 241,"ル" => 451,"ン" => -343},
47
+ :UW6 => {"," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"も" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"一" => -277,"中" => 201,"件" => -800,"会" => 624,"前" => 302,"区" => 1792,"員" => -1212,"委" => 798,"学" => -960,"市" => 887,"広" => -695,"後" => 535,"業" => -697,"相" => 753,"社" => -507,"福" => 974,"空" => -822,"者" => 1811,"連" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496},
48
+ }
47
49
  end
48
50
 
49
51
  def score(category, pattern)
50
- instance_variable_get("@#{category.to_s}")[pattern] || 0
52
+ @models[category][pattern] || 0
51
53
  end
52
54
  end
@@ -1,3 +1,3 @@
1
1
  class TinySegmenter
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -7,7 +7,6 @@ require 'tiny_segmenter'
7
7
  #
8
8
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
9
9
  RSpec.configure do |config|
10
- config.treat_symbols_as_metadata_keys_with_true_values = true
11
10
  config.run_all_when_everything_filtered = true
12
11
  config.filter_run :focus
13
12
  end
@@ -2,43 +2,47 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe TinySegmenter do
5
- subject{ TinySegmenter.new }
5
+ subject{ described_class.new }
6
6
 
7
7
  describe "#segment" do
8
8
  it "tokenizes Japanese text fairly accurately" do
9
- subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。").should == \
10
- ["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"]
9
+ expect(subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。")).to \
10
+ eq(["極めて", "コンパクト", "な", "日本", "語分", "かち", "書き", "ソフトウェア", "です", "。"])
11
11
  end
12
12
 
13
13
  it "removes any whitespace-only or empty tokens" do
14
- subject.segment("書かれた 極めて コンパクト").should_not include("", " ", nil)
14
+ expect(subject.segment("書かれた 極めて コンパクト")).not_to include("", " ", nil)
15
15
  end
16
16
 
17
17
  it "removes full-width space (U+3000) tokens" do
18
18
  sentence = "すてき! 男性が歌う「夢やぶれて」もいいね。"
19
19
  full_width_space = " "
20
- sentence.should include(full_width_space)
21
- subject.segment(sentence).should_not include (full_width_space)
20
+ expect(sentence).to include(full_width_space)
21
+ expect(subject.segment(sentence)).not_to include (full_width_space)
22
22
  end
23
23
 
24
24
  it "tokenizes interspersed non-Japanese words correctly" do
25
- subject.segment("TinySegmenterはRubyだけで").should == ["TinySegmenter", "は", "Ruby", "だけ", "で"]
25
+ expect(subject.segment("TinySegmenterはRubyだけで")).to \
26
+ eq(["TinySegmenter", "は", "Ruby", "だけ", "で"])
26
27
  end
27
28
 
28
29
  context "with ignore_punctuation option not set" do
29
30
  it "includes punctuation-only tokens" do
30
- subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...").should include("。", "!", "?", "、", "「", "」", "...")
31
+ expect(subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...")).to \
32
+ include("。", "!", "?", "、", "「", "」", "...")
31
33
  end
32
34
  end
33
35
 
34
36
  context "with ignore_punctuation option set" do
35
37
  it "removes all punctuation-only tokens" do
36
- subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true).should_not include("。", "!", "?", "、", "「", "」", "...")
38
+ expect(subject.segment("すてき!? 男性が、歌う「夢やぶれて」もいいね。...", ignore_punctuation: true)).not_to \
39
+ include("。", "!", "?", "、", "「", "」", "...")
37
40
  end
38
41
  end
39
42
  end
40
43
 
41
44
  it "has a version" do
42
- TinySegmenter::VERSION.should be_kind_of(String)
45
+ expect(described_class::VERSION).to be_kind_of(String)
46
+ expect(described_class::VERSION).not_to be_empty
43
47
  end
44
48
  end
@@ -5,9 +5,9 @@ require 'tiny_segmenter/version'
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'tiny_segmenter'
7
7
  s.version = TinySegmenter::VERSION
8
- s.date = '2013-03-30'
8
+ s.licenses = ['BSD']
9
9
  s.summary = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
10
- s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
10
+ s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text. Uses a Naive Bayes model that has been trained using the RWCP corpus and optimized using L1-norm regularization. The resultant model is quite compact, yet has a 95% accuracy rate."
11
11
  s.authors = ["Peter Graham"]
12
12
  s.email = ["pete@gigadrill.com"]
13
13
  s.files = `git ls-files`.split("\n")
@@ -15,6 +15,6 @@ Gem::Specification.new do |s|
15
15
  s.require_paths = ["lib"]
16
16
  s.homepage = 'http://github.com/6/tiny_segmenter'
17
17
 
18
- s.add_development_dependency "rake"
19
- s.add_development_dependency "rspec"
18
+ s.add_development_dependency "rake", "~> 10.4"
19
+ s.add_development_dependency "rspec", "~> 3.3"
20
20
  end
metadata CHANGED
@@ -1,53 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Graham
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-03-30 00:00:00.000000000 Z
11
+ date: 2015-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - '>='
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '10.4'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - '>='
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '10.4'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '3.3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
41
- description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
40
+ version: '3.3'
41
+ description: Ruby port of TinySegmenter.js for tokenizing Japanese text. Uses a Naive
42
+ Bayes model that has been trained using the RWCP corpus and optimized using L1-norm
43
+ regularization. The resultant model is quite compact, yet has a 95% accuracy rate.
42
44
  email:
43
45
  - pete@gigadrill.com
44
46
  executables: []
45
47
  extensions: []
46
48
  extra_rdoc_files: []
47
49
  files:
48
- - .gitignore
49
- - .rspec
50
- - .travis.yml
50
+ - ".gitignore"
51
+ - ".rspec"
52
+ - ".travis.yml"
51
53
  - Gemfile
52
54
  - README.md
53
55
  - Rakefile
@@ -58,7 +60,8 @@ files:
58
60
  - spec/tiny_segmenter_spec.rb
59
61
  - tiny_segmenter.gemspec
60
62
  homepage: http://github.com/6/tiny_segmenter
61
- licenses: []
63
+ licenses:
64
+ - BSD
62
65
  metadata: {}
63
66
  post_install_message:
64
67
  rdoc_options: []
@@ -66,17 +69,17 @@ require_paths:
66
69
  - lib
67
70
  required_ruby_version: !ruby/object:Gem::Requirement
68
71
  requirements:
69
- - - '>='
72
+ - - ">="
70
73
  - !ruby/object:Gem::Version
71
74
  version: '0'
72
75
  required_rubygems_version: !ruby/object:Gem::Requirement
73
76
  requirements:
74
- - - '>='
77
+ - - ">="
75
78
  - !ruby/object:Gem::Version
76
79
  version: '0'
77
80
  requirements: []
78
81
  rubyforge_project:
79
- rubygems_version: 2.0.0
82
+ rubygems_version: 2.4.5.1
80
83
  signing_key:
81
84
  specification_version: 4
82
85
  summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.