r_nlp 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/lib/r_nlp.rb +1 -0
- data/lib/r_nlp/tf.rb +1 -0
- data/lib/r_nlp/tokenize.rb +30 -0
- data/lib/r_nlp/version.rb +1 -1
- data/test.rb +20 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3276858a8798293156c1ac8a93f2025ecf92dd99
|
4
|
+
data.tar.gz: f6d1c3523a059bbe0f6ec3bbb1bd2bc74ee6f17f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e909515691ee536b1291b15349304c3e950bca0fcca159f1c20cecdc578cfafebd6f7e2947007bc76db2e3a6f3afa498f79f87f647399a6e776311c0826ea1f0
|
7
|
+
data.tar.gz: cfb67b214090d0ca9c8f9ab0d0b1a1f74cdcdf982c1eb8350dff9dc4ff35966c5b520fd0ad910eae29857a135fa55a2b99726a28553cb27d574674dbb5cf84d1
|
data/.gitignore
CHANGED
data/lib/r_nlp.rb
CHANGED
data/lib/r_nlp/tf.rb
CHANGED
data/lib/r_nlp/tokenize.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'natto'
|
4
|
+
|
5
|
+
module RNlp
|
6
|
+
# it copes only with Japanese
|
7
|
+
class Tokenize
|
8
|
+
def tokenize(input)
|
9
|
+
natto = Natto::MeCab.new
|
10
|
+
# array for token
|
11
|
+
token = Array.new
|
12
|
+
# make morphological analysis
|
13
|
+
natto.parse(input) do |n|
|
14
|
+
# word surface and word speech tag
|
15
|
+
surface = n.surface
|
16
|
+
tag = n.feature.split(',')[0]
|
17
|
+
# 単語が(.||。)の場合は['。', '記号']をpush, それ以外の場合は単語の表出系と品詞タグをpush
|
18
|
+
if tag == '助動詞'
|
19
|
+
token[token.size-1][0] += surface
|
20
|
+
else
|
21
|
+
(surface != nil) ? token.push([surface, tag]) : token.push(['。', '記号']) if(surface != '。' && surface != '.')
|
22
|
+
end
|
23
|
+
end
|
24
|
+
if token[token.size-1][0] == '。'
|
25
|
+
token.pop
|
26
|
+
end
|
27
|
+
return token
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/r_nlp/version.rb
CHANGED
data/test.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'r_nlp'
|
2
|
+
|
3
|
+
a = RNlp::Tf.new('ja')
|
4
|
+
text = '私は誰だ'
|
5
|
+
|
6
|
+
p a.count(text)
|
7
|
+
|
8
|
+
b = RNlp::Tf.new('en')
|
9
|
+
text = 'who are you ?'
|
10
|
+
|
11
|
+
p b.count(text)
|
12
|
+
|
13
|
+
c = ['text 1 is hoge', 'text 2 is yeah', 'text 3 is hoge']
|
14
|
+
|
15
|
+
idf = RNlp::Idf.new('en')
|
16
|
+
p idf.calc_idf('hoge', c)
|
17
|
+
|
18
|
+
str = 'これはテストです'
|
19
|
+
morph = RNlp::Tokenize.new
|
20
|
+
p morph.tokenize(str)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: r_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: natto
|
@@ -72,8 +72,10 @@ files:
|
|
72
72
|
- lib/r_nlp.rb
|
73
73
|
- lib/r_nlp/idf.rb
|
74
74
|
- lib/r_nlp/tf.rb
|
75
|
+
- lib/r_nlp/tokenize.rb
|
75
76
|
- lib/r_nlp/version.rb
|
76
77
|
- r_nlp.gemspec
|
78
|
+
- test.rb
|
77
79
|
homepage: https://github.com/himkt/r_nlp
|
78
80
|
licenses:
|
79
81
|
- MIT
|