tiny_segmenter 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ .DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in tiny_segmenter.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,23 @@
1
+ Ruby port of [TinySegmenter.js](http://chasen.org/~taku/software/TinySegmenter/) for tokenizing Japanese text.
2
+
3
+ ### Install
4
+
5
+ Run `gem install tiny_segmenter`, or add `gem "tiny_segmenter"` to your `Gemfile`.
6
+
7
+ ### Usage
8
+
9
+ ```ruby
10
+ ts = TinySegmenter.new
11
+ p ts.segment("今晩は!良い天気ですね")
12
+ # => ["今晩", "は", "!", "良い", "天気", "です", "ね"]
13
+ ```
14
+
15
+ Input text should be UTF-8 encoded.
16
+
17
+ ### How it works
18
+
19
+ The Naive Bayes model was trained using the [RWCP corpus](http://research.nii.ac.jp/src/list.html) and optimized using L1-norm regularization (see, for example, [this paper](https://research.microsoft.com/pubs/78900/andrew07scalable.pdf)). The resulting model is quite compact, yet (according to the original author) achieves about 95% accuracy.
20
+
21
+ ### License
22
+
23
+ BSD - see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
@@ -1,10 +1,12 @@
1
1
  #encoding: utf-8
2
+ require "tiny_segmenter/version"
3
+ require "tiny_segmenter/segmentation_model"
2
4
 
3
5
  class TinySegmenter
4
6
  def initialize
5
7
  @chartype = []
8
+ @model = SegmentationModel.new
6
9
  @BIAS = -332
7
- set_score_hashes
8
10
  # Compile regex patterns
9
11
  {
10
12
  "[一二三四五六七八九十百千万億兆]" => "M", # numbers (japanese)
@@ -70,67 +72,22 @@ class TinySegmenter
70
72
  def sum_scores(p1, p2, p3, w1, w2, w3, w4, w5, w6, c1, c2, c3, c4, c5, c6)
71
73
  score = 0
72
74
  [
73
- [@UP1, p1], [@UP2, p2], [@UP3, p3],
74
- [@BP1, p1, p2], [@BP2, p2, p3],
75
- [@UW1, w1], [@UW2, w2], [@UW3, w3], [@UW4, w4], [@UW5, w5], [@UW6, w6],
76
- [@BW1, w2, w3], [@BW2, w3, w4], [@BW3, w4, w5],
77
- [@TW1, w1, w2, w3], [@TW2, w2, w3, w4], [@TW3, w3, w4, w5], [@TW4, w4, w5, w6],
78
- [@UC1, c1], [@UC2, c2], [@UC3, c3], [@UC4, c4], [@UC5, c5], [@UC6, c6],
79
- [@BC1, c2, c3], [@BC2, c3, c4], [@BC3, c4, c5],
80
- [@TC1, c1, c2, c3], [@TC2, c2, c3, c4], [@TC3, c3, c4, c5], [@TC4, c4, c5, c6],
81
- [@UQ1, p1, c1], [@UQ2, p2, c2], [@UQ3, p3, c3],
82
- [@BQ1, p2, c2, c3], [@BQ2, p2, c3, c4], [@BQ3, p3, c2, c3], [@BQ4, p3, c3, c4],
83
- [@TQ1, p2, c1, c2, c3], [@TQ2, p2, c2, c3, c4], [@TQ3, p3, c1, c2, c3], [@TQ4, p3, c2, c3, c4],
84
- ].each do |score_hash_and_key|
85
- score_hash = score_hash_and_key[0]
86
- key = score_hash_and_key[1..-1].join("")
87
- score += score_hash[key] || 0
75
+ [:UP1, p1], [:UP2, p2], [:UP3, p3],
76
+ [:BP1, p1, p2], [:BP2, p2, p3],
77
+ [:UW1, w1], [:UW2, w2], [:UW3, w3], [:UW4, w4], [:UW5, w5], [:UW6, w6],
78
+ [:BW1, w2, w3], [:BW2, w3, w4], [:BW3, w4, w5],
79
+ [:TW1, w1, w2, w3], [:TW2, w2, w3, w4], [:TW3, w3, w4, w5], [:TW4, w4, w5, w6],
80
+ [:UC1, c1], [:UC2, c2], [:UC3, c3], [:UC4, c4], [:UC5, c5], [:UC6, c6],
81
+ [:BC1, c2, c3], [:BC2, c3, c4], [:BC3, c4, c5],
82
+ [:TC1, c1, c2, c3], [:TC2, c2, c3, c4], [:TC3, c3, c4, c5], [:TC4, c4, c5, c6],
83
+ [:UQ1, p1, c1], [:UQ2, p2, c2], [:UQ3, p3, c3],
84
+ [:BQ1, p2, c2, c3], [:BQ2, p2, c3, c4], [:BQ3, p3, c2, c3], [:BQ4, p3, c3, c4],
85
+ [:TQ1, p2, c1, c2, c3], [:TQ2, p2, c2, c3, c4], [:TQ3, p3, c1, c2, c3], [:TQ4, p3, c2, c3, c4],
86
+ ].each do |category_and_pattern|
87
+ category = category_and_pattern[0]
88
+ pattern = category_and_pattern[1..-1].join("")
89
+ score += @model.score(category, pattern)
88
90
  end
89
91
  score
90
92
  end
91
-
92
- def set_score_hashes
93
- @BC1 = {"HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378}
94
- @BC2 = {"AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920}
95
- @BC3 = {"HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266}
96
- @BP1 = {"BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352}
97
- @BP2 = {"BO" => 60,"OO" => -1762}
98
- @BQ1 = {"BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965}
99
- @BQ2 = {"BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146}
100
- @BQ3 = {"BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699}
101
- @BQ4 = {"BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973}
102
- @BW1 = {",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682}
103
- @BW2 = {".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669}
104
- @BW3 = {"あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"市" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990}
105
- @TC1 = {"AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832}
106
- @TC2 = {"HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649}
107
- @TC3 = {"AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393}
108
- @TC4 = {"HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841}
109
- @TQ1 = {"BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68}
110
- @TQ2 = {"BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591}
111
- @TQ3 = {"BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685}
112
- @TQ4 = {"BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156}
113
- @TW1 = {"につい" => -4681,"東京都" => 2026}
114
- @TW2 = {"ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216}
115
- @TW3 = {"いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287}
116
- @TW4 = {"いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865}
117
- @UC1 = {"A" => 484,"K" => 93,"M" => 645,"O" => -505}
118
- @UC2 = {"A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646}
119
- @UC3 = {"A" => -1370,"I" => 2311}
120
- @UC4 = {"A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646}
121
- @UC5 = {"H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831}
122
- @UC6 = {"H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387}
123
- @UP1 = {"O" => -214}
124
- @UP2 = {"B" => 69,"O" => 935}
125
- @UP3 = {"B" => 189}
126
- @UQ1 = {"BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422}
127
- @UQ2 = {"BH" => 216,"BI" => 113,"OK" => 1759}
128
- @UQ3 = {"BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212}
129
- @UW1 = {"," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"き" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" => -292,"り" => 208,"れ" => 169,"を" => -446,"ん" => -137,"・" => -135,"主" => -402,"京" => -268,"区" => -912,"午" => 871,"国" => -460,"大" => 561,"委" => 729,"市" => -411,"日" => -141,"理" => 361,"生" => -408,"県" => -386,"都" => -718,"「" => -463,"・" => -135}
130
- @UW2 = {"," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"だ" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"べ" => 1261,"ま" => 600,"も" => -1263,"や" => -402,"よ" => 1639,"り" => -579,"る" => -694,"れ" => 571,"を" => -2516,"ん" => 2095,"ア" => -587,"カ" => 306,"キ" => 568,"ッ" => 831,"三" => -758,"不" => -2150,"世" => -302,"中" => -968,"主" => -861,"事" => 492,"人" => -123,"会" => 978,"保" => 362,"入" => 548,"初" => -3025,"副" => -1566,"北" => -3414,"区" => -422,"大" => -1769,"天" => -865,"太" => -483,"子" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"強" => 1067,"手" => -1519,"揺" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"明" => -1462,"最" => -630,"朝" => -1843,"本" => -1650,"東" => -931,"果" => -665,"次" => -2378,"民" => -180,"気" => -1740,"理" => 752,"発" => 529,"目" => -1584,"相" => -242,"県" => -1165,"立" => -763,"第" => 810,"米" => 509,"自" => -1353,"行" => 838,"西" => -744,"見" => -3874,"調" => 1010,"議" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"「" => -645,"」" => 3145,"ッ" => 831,"ア" => -587,"カ" => 306,"キ" => 568}
131
- @UW3 = {"," => 4889,"1" => -800,"−" => -1723,"、" => 4889,"々" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"ご" => -3116,"さ" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"ち" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"ど" => -899,"な" => -2788,"に" => 2745,"の" => 4056,"は" => 4555,"ひ" => -2171,"ふ" => -1798,"へ" => 1199,"ほ" => -5516,"ま" => -4384,"み" => -120,"め" => 1205,"も" => 2323,"や" => -788,"よ" => -202,"ら" => 727,"り" => 649,"る" => 5905,"れ" => 2773,"わ" => -1207,"を" => 6620,"ん" => -518,"ア" => 551,"グ" => 1319,"ス" => 874,"ッ" => -1350,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278,"・" => -3794,"一" => -1619,"下" => -1759,"世" => -2087,"両" => 3815,"中" => 653,"主" => -758,"予" => -1193,"二" => 974,"人" => 2742,"今" => 792,"他" => 1889,"以" => -1368,"低" => 811,"何" => 4265,"作" => -361,"保" => -2439,"元" => 4858,"党" => 3593,"全" => 1574,"公" => -3030,"六" => 755,"共" => -1880,"円" => 5807,"再" => 3095,"分" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"北" => -1038,"区" => 4646,"千" => -2309,"午" => -783,"協" => -1006,"口" => 483,"右" => 1233,"各" => 3588,"合" => -241,"同" => 3906,"和" => -837,"員" => 4513,"国" => 642,"型" => 1389,"場" => 1219,"外" => -241,"妻" => 2016,"学" => -1356,"安" => -423,"実" => -1008,"家" => 1078,"小" => -513,"少" => -3102,"州" => 1155,"市" => 3197,"平" => -1804,"年" => 2416,"広" => -1030,"府" => 1605,"度" => 1452,"建" => -2352,"当" => -3885,"得" => 1905,"思" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"数" => 3222,"文" => -1489,"新" => 1764,"日" => 2099,"旧" => 5792,"昨" => -661,"時" => -1248,"曜" => -951,"最" => -937,"月" => 4125,"期" => 360,"李" => 3094,"村" => 364,"東" => -805,"核" => 5156,"森" => 2438,"業" => 484,"氏" => 2613,"民" => -1694,"決" => -1073,"法" => 1868,"海" => -495,"無" => 979,"物" => 461,"特" => -3850,"生" => -273,"用" => 914,"町" => 
1215,"的" => 7313,"直" => -1835,"省" => 792,"県" => 6293,"知" => -1528,"私" => 4231,"税" => 401,"立" => -960,"第" => 1201,"米" => 7767,"系" => 3066,"約" => 3663,"級" => 1384,"統" => -4229,"総" => 1163,"線" => 1255,"者" => 6457,"能" => 725,"自" => -2869,"英" => 785,"見" => 1044,"調" => -562,"財" => -733,"費" => 1777,"車" => 1835,"軍" => 1375,"込" => -1504,"通" => -1136,"選" => -681,"郎" => 1026,"郡" => 4404,"部" => 1200,"金" => 2163,"長" => 421,"開" => -1432,"間" => 1302,"関" => -1282,"雨" => 2009,"電" => -1045,"非" => 2066,"駅" => 1620,"1" => -800,"」" => 2670,"・" => -3794,"ッ" => -1350,"ア" => 551,"グ" => 1319,"ス" => 874,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278}
132
- @UW4 = {"," => 3930,"." => 3508,"―" => -4841,"、" => 3930,"。" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"な" => 5433,"に" => 6499,"ぬ" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"ば" => 1940,"ひ" => 4249,"び" => -4134,"ふ" => 1345,"へ" => 6665,"べ" => -744,"ほ" => 1464,"ま" => 1051,"み" => -2082,"む" => -882,"め" => -5046,"も" => 4169,"ゃ" => -2666,"や" => 2795,"ょ" => -1544,"よ" => 3351,"ら" => -2922,"り" => -9726,"る" => -14896,"れ" => -2613,"ろ" => -4570,"わ" => -1783,"を" => 13150,"ん" => -2352,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ッ" => -724,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637,"・" => -4371,"ー" => -11870,"一" => -2069,"中" => 2210,"予" => 782,"事" => -190,"井" => -1768,"人" => 1036,"以" => 544,"会" => 950,"体" => -1286,"作" => 530,"側" => 4292,"先" => 601,"党" => -2006,"共" => -1212,"内" => 584,"円" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"区" => 4517,"協" => 1013,"参" => 1555,"合" => -1834,"和" => -681,"員" => -910,"器" => -851,"回" => 1500,"国" => -619,"園" => -1200,"地" => 866,"場" => -1410,"塁" => -2094,"士" => -1413,"多" => 1067,"大" => 571,"子" => -4802,"学" => -1397,"定" => -1057,"寺" => -809,"小" => 1910,"屋" => -1328,"山" => -1500,"島" => -2056,"川" => -2667,"市" => 2771,"年" => 374,"庁" => -4556,"後" => 456,"性" => 553,"感" => 916,"所" => -1566,"支" => 856,"改" => 787,"政" => 2182,"教" => 704,"文" => 522,"方" => -856,"日" => 1798,"時" => 1829,"最" => 845,"月" => -9066,"木" => -485,"来" => -442,"校" => -360,"業" => -1043,"氏" => 5388,"民" => -2716,"気" => -910,"沢" => -939,"済" => -543,"物" => -735,"率" => 672,"球" => -1267,"生" => -1286,"産" => -1101,"田" => -2900,"町" => 1826,"的" => 2586,"目" => 
922,"省" => -3485,"県" => 2997,"空" => -867,"立" => -2112,"第" => 788,"米" => 2937,"系" => 786,"約" => 2171,"経" => 1146,"統" => -1169,"総" => 940,"線" => -994,"署" => 749,"者" => 2145,"能" => -730,"般" => -852,"行" => -792,"規" => 792,"警" => -1184,"議" => -244,"谷" => -1000,"賞" => 730,"車" => -1481,"軍" => 1158,"輪" => -1433,"込" => -3370,"近" => 929,"道" => -1291,"選" => 2596,"郎" => -4866,"都" => 1192,"野" => -1100,"銀" => -2213,"長" => 357,"間" => -2344,"院" => -2297,"際" => -2604,"電" => -878,"領" => -1659,"題" => -792,"館" => -1984,"首" => 1749,"高" => 2120,"「" => 1895,"」" => 3798,"・" => -4371,"ッ" => -724,"ー" => -11870,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637}
133
- @UW5 = {"," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"さ" => -1537,"し" => -1371,"す" => -852,"だ" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"ど" => 1682,"な" => -787,"に" => -1224,"の" => -635,"は" => -578,"べ" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"イ" => 241,"ル" => 451,"ン" => -343,"中" => -871,"京" => 722,"会" => -1153,"党" => -654,"務" => 3519,"区" => -901,"告" => 848,"員" => 2104,"大" => -1296,"学" => -548,"定" => 1785,"嵐" => -1304,"市" => -2991,"席" => 921,"年" => 1763,"思" => 872,"所" => -814,"挙" => 1618,"新" => -1682,"日" => 218,"月" => -4353,"査" => 932,"格" => 1356,"機" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"相" => 1319,"省" => -1052,"県" => -4003,"研" => -997,"社" => -278,"空" => -813,"統" => 1955,"者" => -2233,"表" => 663,"語" => -1073,"議" => 1219,"選" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"1" => -514,"E2" => -32768,"「" => 363,"イ" => 241,"ル" => 451,"ン" => -343}
134
- @UW6 = {"," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"も" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"一" => -277,"中" => 201,"件" => -800,"会" => 624,"前" => 302,"区" => 1792,"員" => -1212,"委" => 798,"学" => -960,"市" => 887,"広" => -695,"後" => 535,"業" => -697,"相" => 753,"社" => -507,"福" => 974,"空" => -822,"者" => 1811,"連" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496}
135
- end
136
93
  end
@@ -0,0 +1,52 @@
1
+ #encoding: utf-8
2
+
3
+ class SegmentationModel
4
+ def initialize
5
+ @BC1 = {"HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378}
6
+ @BC2 = {"AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920}
7
+ @BC3 = {"HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266}
8
+ @BP1 = {"BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352}
9
+ @BP2 = {"BO" => 60,"OO" => -1762}
10
+ @BQ1 = {"BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965}
11
+ @BQ2 = {"BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146}
12
+ @BQ3 = {"BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699}
13
+ @BQ4 = {"BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973}
14
+ @BW1 = {",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682}
15
+ @BW2 = {".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669}
16
+ @BW3 = {"あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"市" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990}
17
+ @TC1 = {"AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832}
18
+ @TC2 = {"HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649}
19
+ @TC3 = {"AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393}
20
+ @TC4 = {"HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841}
21
+ @TQ1 = {"BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68}
22
+ @TQ2 = {"BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591}
23
+ @TQ3 = {"BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685}
24
+ @TQ4 = {"BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156}
25
+ @TW1 = {"につい" => -4681,"東京都" => 2026}
26
+ @TW2 = {"ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216}
27
+ @TW3 = {"いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287}
28
+ @TW4 = {"いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865}
29
+ @UC1 = {"A" => 484,"K" => 93,"M" => 645,"O" => -505}
30
+ @UC2 = {"A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646}
31
+ @UC3 = {"A" => -1370,"I" => 2311}
32
+ @UC4 = {"A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646}
33
+ @UC5 = {"H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831}
34
+ @UC6 = {"H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387}
35
+ @UP1 = {"O" => -214}
36
+ @UP2 = {"B" => 69,"O" => 935}
37
+ @UP3 = {"B" => 189}
38
+ @UQ1 = {"BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422}
39
+ @UQ2 = {"BH" => 216,"BI" => 113,"OK" => 1759}
40
+ @UQ3 = {"BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212}
41
+ @UW1 = {"," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"き" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" => -292,"り" => 208,"れ" => 169,"を" => -446,"ん" => -137,"・" => -135,"主" => -402,"京" => -268,"区" => -912,"午" => 871,"国" => -460,"大" => 561,"委" => 729,"市" => -411,"日" => -141,"理" => 361,"生" => -408,"県" => -386,"都" => -718,"「" => -463,"・" => -135}
42
+ @UW2 = {"," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"だ" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"べ" => 1261,"ま" => 600,"も" => -1263,"や" => -402,"よ" => 1639,"り" => -579,"る" => -694,"れ" => 571,"を" => -2516,"ん" => 2095,"ア" => -587,"カ" => 306,"キ" => 568,"ッ" => 831,"三" => -758,"不" => -2150,"世" => -302,"中" => -968,"主" => -861,"事" => 492,"人" => -123,"会" => 978,"保" => 362,"入" => 548,"初" => -3025,"副" => -1566,"北" => -3414,"区" => -422,"大" => -1769,"天" => -865,"太" => -483,"子" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"強" => 1067,"手" => -1519,"揺" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"明" => -1462,"最" => -630,"朝" => -1843,"本" => -1650,"東" => -931,"果" => -665,"次" => -2378,"民" => -180,"気" => -1740,"理" => 752,"発" => 529,"目" => -1584,"相" => -242,"県" => -1165,"立" => -763,"第" => 810,"米" => 509,"自" => -1353,"行" => 838,"西" => -744,"見" => -3874,"調" => 1010,"議" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"「" => -645,"」" => 3145,"ッ" => 831,"ア" => -587,"カ" => 306,"キ" => 568}
43
+ @UW3 = {"," => 4889,"1" => -800,"−" => -1723,"、" => 4889,"々" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"ご" => -3116,"さ" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"ち" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"ど" => -899,"な" => -2788,"に" => 2745,"の" => 4056,"は" => 4555,"ひ" => -2171,"ふ" => -1798,"へ" => 1199,"ほ" => -5516,"ま" => -4384,"み" => -120,"め" => 1205,"も" => 2323,"や" => -788,"よ" => -202,"ら" => 727,"り" => 649,"る" => 5905,"れ" => 2773,"わ" => -1207,"を" => 6620,"ん" => -518,"ア" => 551,"グ" => 1319,"ス" => 874,"ッ" => -1350,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278,"・" => -3794,"一" => -1619,"下" => -1759,"世" => -2087,"両" => 3815,"中" => 653,"主" => -758,"予" => -1193,"二" => 974,"人" => 2742,"今" => 792,"他" => 1889,"以" => -1368,"低" => 811,"何" => 4265,"作" => -361,"保" => -2439,"元" => 4858,"党" => 3593,"全" => 1574,"公" => -3030,"六" => 755,"共" => -1880,"円" => 5807,"再" => 3095,"分" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"北" => -1038,"区" => 4646,"千" => -2309,"午" => -783,"協" => -1006,"口" => 483,"右" => 1233,"各" => 3588,"合" => -241,"同" => 3906,"和" => -837,"員" => 4513,"国" => 642,"型" => 1389,"場" => 1219,"外" => -241,"妻" => 2016,"学" => -1356,"安" => -423,"実" => -1008,"家" => 1078,"小" => -513,"少" => -3102,"州" => 1155,"市" => 3197,"平" => -1804,"年" => 2416,"広" => -1030,"府" => 1605,"度" => 1452,"建" => -2352,"当" => -3885,"得" => 1905,"思" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"数" => 3222,"文" => -1489,"新" => 1764,"日" => 2099,"旧" => 5792,"昨" => -661,"時" => -1248,"曜" => -951,"最" => -937,"月" => 4125,"期" => 360,"李" => 3094,"村" => 364,"東" => -805,"核" => 5156,"森" => 2438,"業" => 484,"氏" => 2613,"民" => -1694,"決" => -1073,"法" => 1868,"海" => -495,"無" => 979,"物" => 461,"特" => -3850,"生" => -273,"用" => 914,"町" => 
1215,"的" => 7313,"直" => -1835,"省" => 792,"県" => 6293,"知" => -1528,"私" => 4231,"税" => 401,"立" => -960,"第" => 1201,"米" => 7767,"系" => 3066,"約" => 3663,"級" => 1384,"統" => -4229,"総" => 1163,"線" => 1255,"者" => 6457,"能" => 725,"自" => -2869,"英" => 785,"見" => 1044,"調" => -562,"財" => -733,"費" => 1777,"車" => 1835,"軍" => 1375,"込" => -1504,"通" => -1136,"選" => -681,"郎" => 1026,"郡" => 4404,"部" => 1200,"金" => 2163,"長" => 421,"開" => -1432,"間" => 1302,"関" => -1282,"雨" => 2009,"電" => -1045,"非" => 2066,"駅" => 1620,"1" => -800,"」" => 2670,"・" => -3794,"ッ" => -1350,"ア" => 551,"グ" => 1319,"ス" => 874,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278}
44
+ @UW4 = {"," => 3930,"." => 3508,"―" => -4841,"、" => 3930,"。" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"な" => 5433,"に" => 6499,"ぬ" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"ば" => 1940,"ひ" => 4249,"び" => -4134,"ふ" => 1345,"へ" => 6665,"べ" => -744,"ほ" => 1464,"ま" => 1051,"み" => -2082,"む" => -882,"め" => -5046,"も" => 4169,"ゃ" => -2666,"や" => 2795,"ょ" => -1544,"よ" => 3351,"ら" => -2922,"り" => -9726,"る" => -14896,"れ" => -2613,"ろ" => -4570,"わ" => -1783,"を" => 13150,"ん" => -2352,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ッ" => -724,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637,"・" => -4371,"ー" => -11870,"一" => -2069,"中" => 2210,"予" => 782,"事" => -190,"井" => -1768,"人" => 1036,"以" => 544,"会" => 950,"体" => -1286,"作" => 530,"側" => 4292,"先" => 601,"党" => -2006,"共" => -1212,"内" => 584,"円" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"区" => 4517,"協" => 1013,"参" => 1555,"合" => -1834,"和" => -681,"員" => -910,"器" => -851,"回" => 1500,"国" => -619,"園" => -1200,"地" => 866,"場" => -1410,"塁" => -2094,"士" => -1413,"多" => 1067,"大" => 571,"子" => -4802,"学" => -1397,"定" => -1057,"寺" => -809,"小" => 1910,"屋" => -1328,"山" => -1500,"島" => -2056,"川" => -2667,"市" => 2771,"年" => 374,"庁" => -4556,"後" => 456,"性" => 553,"感" => 916,"所" => -1566,"支" => 856,"改" => 787,"政" => 2182,"教" => 704,"文" => 522,"方" => -856,"日" => 1798,"時" => 1829,"最" => 845,"月" => -9066,"木" => -485,"来" => -442,"校" => -360,"業" => -1043,"氏" => 5388,"民" => -2716,"気" => -910,"沢" => -939,"済" => -543,"物" => -735,"率" => 672,"球" => -1267,"生" => -1286,"産" => -1101,"田" => -2900,"町" => 1826,"的" => 2586,"目" => 
922,"省" => -3485,"県" => 2997,"空" => -867,"立" => -2112,"第" => 788,"米" => 2937,"系" => 786,"約" => 2171,"経" => 1146,"統" => -1169,"総" => 940,"線" => -994,"署" => 749,"者" => 2145,"能" => -730,"般" => -852,"行" => -792,"規" => 792,"警" => -1184,"議" => -244,"谷" => -1000,"賞" => 730,"車" => -1481,"軍" => 1158,"輪" => -1433,"込" => -3370,"近" => 929,"道" => -1291,"選" => 2596,"郎" => -4866,"都" => 1192,"野" => -1100,"銀" => -2213,"長" => 357,"間" => -2344,"院" => -2297,"際" => -2604,"電" => -878,"領" => -1659,"題" => -792,"館" => -1984,"首" => 1749,"高" => 2120,"「" => 1895,"」" => 3798,"・" => -4371,"ッ" => -724,"ー" => -11870,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637}
45
+ @UW5 = {"," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"さ" => -1537,"し" => -1371,"す" => -852,"だ" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"ど" => 1682,"な" => -787,"に" => -1224,"の" => -635,"は" => -578,"べ" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"イ" => 241,"ル" => 451,"ン" => -343,"中" => -871,"京" => 722,"会" => -1153,"党" => -654,"務" => 3519,"区" => -901,"告" => 848,"員" => 2104,"大" => -1296,"学" => -548,"定" => 1785,"嵐" => -1304,"市" => -2991,"席" => 921,"年" => 1763,"思" => 872,"所" => -814,"挙" => 1618,"新" => -1682,"日" => 218,"月" => -4353,"査" => 932,"格" => 1356,"機" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"相" => 1319,"省" => -1052,"県" => -4003,"研" => -997,"社" => -278,"空" => -813,"統" => 1955,"者" => -2233,"表" => 663,"語" => -1073,"議" => 1219,"選" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"1" => -514,"E2" => -32768,"「" => 363,"イ" => 241,"ル" => 451,"ン" => -343}
46
+ @UW6 = {"," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"も" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"一" => -277,"中" => 201,"件" => -800,"会" => 624,"前" => 302,"区" => 1792,"員" => -1212,"委" => 798,"学" => -960,"市" => 887,"広" => -695,"後" => 535,"業" => -697,"相" => 753,"社" => -507,"福" => 974,"空" => -822,"者" => 1811,"連" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496}
47
+ end
48
+
49
# Looks up +pattern+ in the hash held by the instance variable named after
# +category+ (e.g. category :UW3 reads @UW3).
#
# category - table name without the leading "@"; interpolated into the
#            instance-variable name, so a String or Symbol both work.
# pattern  - key to look up in that table.
#
# Returns the stored value, or 0 when the table has no truthy entry for
# +pattern+. Raises NoMethodError if no such instance variable has been set,
# because the lookup is then attempted on nil.
def score(category, pattern)
  table = instance_variable_get("@#{category}")
  weight = table[pattern]
  return 0 unless weight

  weight
end
52
+ end
@@ -0,0 +1,3 @@
1
class TinySegmenter
  # Release number of the gem; the gemspec reads this constant for its
  # version field. Frozen so the shared string cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,13 @@
1
+ require 'tiny_segmenter'
2
+
3
+ # This file was generated by the `rspec --init` command. Conventionally, all
4
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
5
+ # Require this file using `require "spec_helper.rb"` to ensure that it is only
6
+ # loaded once.
7
+ #
8
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
9
+ RSpec.configure do |config| # Suite-wide RSpec settings for this gem's specs.
10
+ config.treat_symbols_as_metadata_keys_with_true_values = true # a bare symbol tag such as :focus is read as :focus => true
11
+ config.run_all_when_everything_filtered = true # run every example when the filter below matches none
12
+ config.filter_run :focus # limit runs to examples tagged :focus; NOTE(review): presumably needs the symbol setting above assigned first - confirm before reordering
13
+ end
@@ -0,0 +1,23 @@
1
+ #encoding: utf-8
2
+ require 'spec_helper'
3
+
4
# Behavioural specs for TinySegmenter#segment plus a sanity check on VERSION.
describe TinySegmenter do
  subject { TinySegmenter.new }

  it "tokenizes Japanese text fairly accurately" do
    # Pins the exact token list expected for this sentence.
    tokens = subject.segment("極めてコンパクトな日本語分かち書きソフトウェアです。")
    tokens.should == %w[極めて コンパクト な 日本 語分 かち 書き ソフトウェア です 。]
  end

  it "removes any whitespace-only or empty tokens" do
    tokens = subject.segment("書かれた 極めて コンパクト")
    tokens.should_not include("", " ")
  end

  it "tokenizes interspersed non-Japanese words correctly" do
    tokens = subject.segment("TinySegmenterはRubyだけで")
    tokens.should == %w[TinySegmenter は Ruby だけ で]
  end

  it "has a version" do
    TinySegmenter::VERSION.should_not be_empty
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require 'tiny_segmenter/version'
4
+
5
# Packaging metadata for the tiny_segmenter gem.
Gem::Specification.new do |s|
  s.name        = 'tiny_segmenter'
  s.version     = TinySegmenter::VERSION
  s.date        = '2012-08-27'
  s.summary     = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
  s.description = "Ruby port of TinySegmenter.js for tokenizing Japanese text."
  s.authors     = ["Peter Graham"]
  s.email       = ["pete@gigadrill.com"]

  # Ship every file tracked by git; query git once and reuse the list.
  tracked_files = `git ls-files`.split("\n")
  s.files       = tracked_files
  # Select test files in Ruby instead of passing `{test,spec,features}/*` to
  # the shell: brace expansion is not available in every /bin/sh, and without
  # it the pattern matches nothing and test_files silently ends up empty.
  s.test_files    = tracked_files.grep(%r{\A(?:test|spec|features)/})
  s.require_paths = ["lib"]
  s.homepage      = 'http://github.com/6/tiny_segmenter'

  s.add_development_dependency "rake"
  s.add_development_dependency "rspec"
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tiny_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,8 +9,40 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-19 00:00:00.000000000 Z
13
- dependencies: []
12
+ date: 2012-08-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
14
46
  description: Ruby port of TinySegmenter.js for tokenizing Japanese text.
15
47
  email:
16
48
  - pete@gigadrill.com
@@ -18,7 +50,16 @@ executables: []
18
50
  extensions: []
19
51
  extra_rdoc_files: []
20
52
  files:
53
+ - .gitignore
54
+ - .rspec
55
+ - Gemfile
56
+ - README.md
21
57
  - lib/tiny_segmenter.rb
58
+ - lib/tiny_segmenter/segmentation_model.rb
59
+ - lib/tiny_segmenter/version.rb
60
+ - spec/spec_helper.rb
61
+ - spec/tiny_segmenter_spec.rb
62
+ - tiny_segmenter.gemspec
22
63
  homepage: http://github.com/6/tiny_segmenter
23
64
  licenses: []
24
65
  post_install_message:
@@ -43,4 +84,6 @@ rubygems_version: 1.8.21
43
84
  signing_key:
44
85
  specification_version: 3
45
86
  summary: Ruby port of TinySegmenter.js for tokenizing Japanese text.
46
- test_files: []
87
+ test_files:
88
+ - spec/spec_helper.rb
89
+ - spec/tiny_segmenter_spec.rb