r_nlp 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b168605a8d40c7056cb3ecb2a94bb2c0b195138
4
- data.tar.gz: 32287c8ac286bdf93175b1844145a611dc4cc6ec
3
+ metadata.gz: 3276858a8798293156c1ac8a93f2025ecf92dd99
4
+ data.tar.gz: f6d1c3523a059bbe0f6ec3bbb1bd2bc74ee6f17f
5
5
  SHA512:
6
- metadata.gz: 53fc55794f207063e4ca5c7a2e3137469ba07cb450e5f04c43ecad7bc16ca095611b1a78e9e7a843c7592b3eb1df7ed9a54d6874ecf7804bd620857a990a5ce8
7
- data.tar.gz: ffad3007c059a3086b6bf29dd29b818a5480f209f8f5c1c5791e8c24e2027bea4ac13fa11366e14cd0f5897e51d76a5be5976267b72bd83ef6bdc02e2fb54afe
6
+ metadata.gz: e909515691ee536b1291b15349304c3e950bca0fcca159f1c20cecdc578cfafebd6f7e2947007bc76db2e3a6f3afa498f79f87f647399a6e776311c0826ea1f0
7
+ data.tar.gz: cfb67b214090d0ca9c8f9ab0d0b1a1f74cdcdf982c1eb8350dff9dc4ff35966c5b520fd0ad910eae29857a135fa55a2b99726a28553cb27d574674dbb5cf84d1
data/.gitignore CHANGED
@@ -9,4 +9,3 @@
9
9
  /tmp/
10
10
  .*
11
11
  !.gitignore
12
- hoge.rb
@@ -1,6 +1,7 @@
1
1
  require "r_nlp/version"
2
2
  require "r_nlp/tf"
3
3
  require 'r_nlp/idf'
4
+ require 'r_nlp/tokenize'
4
5
 
5
6
  module RNlp
6
7
  # Your code goes here...
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+
2
3
  require 'natto'
3
4
 
4
5
  module RNlp
@@ -0,0 +1,30 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'natto'
4
+
5
module RNlp
  # Morphological tokenizer built on MeCab through the natto gem.
  # It copes only with Japanese.
  class Tokenize
    # Tag (first feature field) that marks an auxiliary verb.
    AUXILIARY_VERB_TAG = '助動詞'.freeze
    # Period surfaces that are never emitted as tokens of their own.
    PERIOD_SURFACES = ['。', '.'].freeze
    private_constant :AUXILIARY_VERB_TAG, :PERIOD_SURFACES

    # Splits +input+ into an array of [surface, tag] pairs.
    #
    # The tag is the first comma-separated field of the node's feature string.
    # An auxiliary verb is not emitted as its own token: its surface is
    # appended to the surface of the token before it. When there is no token
    # before it, it is kept as a standalone token instead of raising.
    # Nodes whose surface is '。' or '.' are dropped.
    #
    # @param input [String] text handed to Natto::MeCab#parse
    # @return [Array<Array(String, String)>] [surface, tag] pairs; empty when
    #   the parser yields nothing
    def tokenize(input)
      tokens = []

      Natto::MeCab.new.parse(input) do |node|
        surface = node.surface
        tag = node.feature.split(',')[0]

        if tag == AUXILIARY_VERB_TAG && !tokens.empty?
          # `+=` builds a new string, so the string taken from the previous
          # node is not mutated in place.
          tokens.last[0] += surface
        elsif !PERIOD_SURFACES.include?(surface)
          # A node without a surface becomes a ['。', '記号'] marker.
          # NOTE(review): presumably this is MeCab's end-of-sentence node;
          # confirm against natto.
          tokens.push(surface.nil? ? ['。', '記号'] : [surface, tag])
        end
      end

      # Drop a trailing marker; guard against an empty result.
      last = tokens.last
      tokens.pop if last && last[0] == '。'
      tokens
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module RNlp
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
data/test.rb ADDED
@@ -0,0 +1,20 @@
1
+ require 'r_nlp'
2
+
3
# Manual smoke test: prints the result of each public entry point of the gem.

# Term counting on Japanese text.
japanese_tf = RNlp::Tf.new('ja')
p japanese_tf.count('私は誰だ')

# Term counting on English text.
english_tf = RNlp::Tf.new('en')
p english_tf.count('who are you ?')

# IDF of the term 'hoge' over a three-document corpus.
corpus = ['text 1 is hoge', 'text 2 is yeah', 'text 3 is hoge']
english_idf = RNlp::Idf.new('en')
p english_idf.calc_idf('hoge', corpus)

# Tokenization of a Japanese sentence.
tokenizer = RNlp::Tokenize.new
p tokenizer.tokenize('これはテストです')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: r_nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - himkt
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2015-03-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: natto
@@ -72,8 +72,10 @@ files:
72
72
  - lib/r_nlp.rb
73
73
  - lib/r_nlp/idf.rb
74
74
  - lib/r_nlp/tf.rb
75
+ - lib/r_nlp/tokenize.rb
75
76
  - lib/r_nlp/version.rb
76
77
  - r_nlp.gemspec
78
+ - test.rb
77
79
  homepage: https://github.com/himkt/r_nlp
78
80
  licenses:
79
81
  - MIT