r_nlp 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b168605a8d40c7056cb3ecb2a94bb2c0b195138
4
- data.tar.gz: 32287c8ac286bdf93175b1844145a611dc4cc6ec
3
+ metadata.gz: 3276858a8798293156c1ac8a93f2025ecf92dd99
4
+ data.tar.gz: f6d1c3523a059bbe0f6ec3bbb1bd2bc74ee6f17f
5
5
  SHA512:
6
- metadata.gz: 53fc55794f207063e4ca5c7a2e3137469ba07cb450e5f04c43ecad7bc16ca095611b1a78e9e7a843c7592b3eb1df7ed9a54d6874ecf7804bd620857a990a5ce8
7
- data.tar.gz: ffad3007c059a3086b6bf29dd29b818a5480f209f8f5c1c5791e8c24e2027bea4ac13fa11366e14cd0f5897e51d76a5be5976267b72bd83ef6bdc02e2fb54afe
6
+ metadata.gz: e909515691ee536b1291b15349304c3e950bca0fcca159f1c20cecdc578cfafebd6f7e2947007bc76db2e3a6f3afa498f79f87f647399a6e776311c0826ea1f0
7
+ data.tar.gz: cfb67b214090d0ca9c8f9ab0d0b1a1f74cdcdf982c1eb8350dff9dc4ff35966c5b520fd0ad910eae29857a135fa55a2b99726a28553cb27d574674dbb5cf84d1
data/.gitignore CHANGED
@@ -9,4 +9,3 @@
9
9
  /tmp/
10
10
  .*
11
11
  !.gitignore
12
- hoge.rb
@@ -1,6 +1,7 @@
1
1
  require "r_nlp/version"
2
2
  require "r_nlp/tf"
3
3
  require 'r_nlp/idf'
4
+ require 'r_nlp/tokenize'
4
5
 
5
6
  module RNlp
6
7
  # Your code goes here...
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+
2
3
  require 'natto'
3
4
 
4
5
  module RNlp
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'natto'
4
+
5
module RNlp
  # Morphological tokenizer backed by MeCab through the natto gem.
  # It copes only with Japanese.
  class Tokenize
    # Splits +input+ into [surface, part-of-speech tag] pairs.
    #
    # A fresh Natto::MeCab instance is created on every call. For each node
    # the tag is the first comma-separated field of the node's feature string.
    #
    # * A node tagged '助動詞' (auxiliary verb) is not emitted on its own; its
    #   surface is appended to the surface of the preceding token. When there
    #   is no preceding token it is emitted as an ordinary token instead of
    #   raising.
    # * Nodes whose surface is '。' or '.' are dropped.
    # * A node with a nil surface is recorded as ['。', '記号'];
    #   NOTE(review): presumably this is MeCab's end-of-sentence node --
    #   confirm against the natto version in use.
    # * If the last collected token has the surface '。' it is removed, so
    #   the result does not end with that marker.
    #
    # @param input [String] text handed to Natto::MeCab#parse
    # @return [Array<Array>] list of [surface, tag] pairs; empty when the
    #   analysis produced no tokens
    def tokenize(input)
      natto = Natto::MeCab.new
      tokens = []

      natto.parse(input) do |node|
        surface = node.surface
        tag = node.feature.split(',').first

        if tag == '助動詞' && !tokens.empty?
          # Glue the auxiliary verb onto the word collected just before it.
          tokens.last[0] += surface
        elsif surface.nil?
          tokens.push(['。', '記号'])
        elsif surface != '。' && surface != '.'
          tokens.push([surface, tag])
        end
      end

      # Guard against an empty list before inspecting the last token.
      tokens.pop if tokens.last && tokens.last[0] == '。'
      tokens
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RNlp
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
data/test.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'r_nlp'
2
+
3
# Smoke script: prints the result of each public r_nlp entry point.

# Tf#count on a Japanese sentence.
japanese_tf = RNlp::Tf.new('ja')
p japanese_tf.count('私は誰だ')

# Tf#count on an English sentence.
english_tf = RNlp::Tf.new('en')
p english_tf.count('who are you ?')

# Idf#calc_idf for the word 'hoge' over three short English texts.
corpus = [
  'text 1 is hoge',
  'text 2 is yeah',
  'text 3 is hoge'
]
english_idf = RNlp::Idf.new('en')
p english_idf.calc_idf('hoge', corpus)

# Tokenize#tokenize on a Japanese sentence.
tokenizer = RNlp::Tokenize.new
p tokenizer.tokenize('これはテストです')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: r_nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - himkt
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2015-03-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: natto
@@ -72,8 +72,10 @@ files:
72
72
  - lib/r_nlp.rb
73
73
  - lib/r_nlp/idf.rb
74
74
  - lib/r_nlp/tf.rb
75
+ - lib/r_nlp/tokenize.rb
75
76
  - lib/r_nlp/version.rb
76
77
  - r_nlp.gemspec
78
+ - test.rb
77
79
  homepage: https://github.com/himkt/r_nlp
78
80
  licenses:
79
81
  - MIT