r_nlp 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -1
- data/lib/r_nlp.rb +1 -0
- data/lib/r_nlp/tf.rb +1 -0
- data/lib/r_nlp/tokenize.rb +30 -0
- data/lib/r_nlp/version.rb +1 -1
- data/test.rb +20 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3276858a8798293156c1ac8a93f2025ecf92dd99
|
4
|
+
data.tar.gz: f6d1c3523a059bbe0f6ec3bbb1bd2bc74ee6f17f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e909515691ee536b1291b15349304c3e950bca0fcca159f1c20cecdc578cfafebd6f7e2947007bc76db2e3a6f3afa498f79f87f647399a6e776311c0826ea1f0
|
7
|
+
data.tar.gz: cfb67b214090d0ca9c8f9ab0d0b1a1f74cdcdf982c1eb8350dff9dc4ff35966c5b520fd0ad910eae29857a135fa55a2b99726a28553cb27d574674dbb5cf84d1
|
data/.gitignore
CHANGED
data/lib/r_nlp.rb
CHANGED
data/lib/r_nlp/tf.rb
CHANGED
data/lib/r_nlp/tokenize.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'natto'
|
4
|
+
|
5
|
+
module RNlp
  # Morphological tokenizer backed by MeCab through the natto gem.
  # It copes only with Japanese.
  class Tokenize
    # Splits +input+ into [surface, part-of-speech tag] pairs.
    #
    # Rules applied to each node yielded by Natto::MeCab#parse:
    # * a node tagged '助動詞' (auxiliary verb) is appended to the surface of
    #   the previous token instead of becoming a token of its own;
    # * a node whose surface is nil is recorded as the marker ['。', '記号'];
    # * a node whose surface is '。' or '.' is dropped;
    # * any other node is recorded as [surface, tag].
    # A marker left at the very end of the list is removed before returning.
    #
    # NOTE(review): the original comment said periods should be pushed as
    # ['。', '記号'], but the code has always dropped them; that behavior is
    # kept here. The nil-surface node is presumably MeCab's end-of-sentence
    # node -- confirm against the natto version in use.
    #
    # @param input [String] text handed to MeCab
    # @return [Array<Array>] list of [surface, tag] pairs; empty when MeCab
    #   yields nothing usable
    def tokenize(input)
      tokens = []
      # make morphological analysis
      Natto::MeCab.new.parse(input) do |node|
        # word surface and the first field of the feature string (the tag)
        surface = node.surface
        tag = node.feature.split(',').first

        if tag == '助動詞' && !tokens.empty?
          # `+=` builds a new string, so the surface owned by the earlier
          # node is never mutated.
          tokens.last[0] += surface
        elsif surface.nil?
          tokens.push(['。', '記号'])
        elsif surface != '。' && surface != '.'
          # Also reached by an auxiliary verb that has no previous token to
          # attach to; the old code crashed on nil in that case.
          tokens.push([surface, tag])
        end
      end

      # Guard against an empty list: the old code indexed into nil here.
      tokens.pop if !tokens.empty? && tokens.last[0] == '。'
      tokens
    end
  end
end
|
data/lib/r_nlp/version.rb
CHANGED
data/test.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'r_nlp'

# Smoke script: exercises each public class of the gem and prints the result.

# Term counts for a Japanese sentence.
japanese_tf = RNlp::Tf.new('ja')
japanese_sentence = '私は誰だ'
p japanese_tf.count(japanese_sentence)

# Term counts for an English sentence.
english_tf = RNlp::Tf.new('en')
english_sentence = 'who are you ?'
p english_tf.count(english_sentence)

# IDF of the word 'hoge' over a three-document English corpus.
corpus = ['text 1 is hoge', 'text 2 is yeah', 'text 3 is hoge']
english_idf = RNlp::Idf.new('en')
p english_idf.calc_idf('hoge', corpus)

# Morphological tokenization of a Japanese sentence.
tokenizer = RNlp::Tokenize.new
sample = 'これはテストです'
p tokenizer.tokenize(sample)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: r_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: natto
|
@@ -72,8 +72,10 @@ files:
|
|
72
72
|
- lib/r_nlp.rb
|
73
73
|
- lib/r_nlp/idf.rb
|
74
74
|
- lib/r_nlp/tf.rb
|
75
|
+
- lib/r_nlp/tokenize.rb
|
75
76
|
- lib/r_nlp/version.rb
|
76
77
|
- r_nlp.gemspec
|
78
|
+
- test.rb
|
77
79
|
homepage: https://github.com/himkt/r_nlp
|
78
80
|
licenses:
|
79
81
|
- MIT
|