RubyGems - r_nlp - Versions diffs - 0.1.7 → 0.1.8 - Mend

r_nlp 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6b168605a8d40c7056cb3ecb2a94bb2c0b195138
-  data.tar.gz: 32287c8ac286bdf93175b1844145a611dc4cc6ec
+  metadata.gz: 3276858a8798293156c1ac8a93f2025ecf92dd99
+  data.tar.gz: f6d1c3523a059bbe0f6ec3bbb1bd2bc74ee6f17f
 SHA512:
-  metadata.gz: 53fc55794f207063e4ca5c7a2e3137469ba07cb450e5f04c43ecad7bc16ca095611b1a78e9e7a843c7592b3eb1df7ed9a54d6874ecf7804bd620857a990a5ce8
-  data.tar.gz: ffad3007c059a3086b6bf29dd29b818a5480f209f8f5c1c5791e8c24e2027bea4ac13fa11366e14cd0f5897e51d76a5be5976267b72bd83ef6bdc02e2fb54afe
+  metadata.gz: e909515691ee536b1291b15349304c3e950bca0fcca159f1c20cecdc578cfafebd6f7e2947007bc76db2e3a6f3afa498f79f87f647399a6e776311c0826ea1f0
+  data.tar.gz: cfb67b214090d0ca9c8f9ab0d0b1a1f74cdcdf982c1eb8350dff9dc4ff35966c5b520fd0ad910eae29857a135fa55a2b99726a28553cb27d574674dbb5cf84d1

data/.gitignore CHANGED

@@ -9,4 +9,3 @@
 /tmp/
 .*
 !.gitignore
-hoge.rb

data/lib/r_nlp.rb CHANGED

@@ -1,6 +1,7 @@
 require "r_nlp/version"
 require "r_nlp/tf"
 require 'r_nlp/idf'
+require 'r_nlp/tokenize'
 module RNlp
   # Your code goes here...

data/lib/r_nlp/tf.rb CHANGED

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
 require 'natto'
 module RNlp

data/lib/r_nlp/tokenize.rb ADDED

@@ -0,0 +1,30 @@
+# -*- encoding: utf-8 -*-
+require 'natto'
+module RNlp
+  # it copes only with Japanese
+  class Tokenize
+    def tokenize(input)
+      natto = Natto::MeCab.new
+      # array for token
+      token = Array.new
+      # make morphological analysis
+      natto.parse(input) do |n|
+        # word surface and word speech tag
+        surface = n.surface
+        tag = n.feature.split(',')[0]
+        # 単語が(.||。)の場合は['。', '記号']をpush, それ以外の場合は単語の表出系と品詞タグをpush
+        if tag == '助動詞'
+          token[token.size-1][0] += surface
+        else
+          (surface != nil) ? token.push([surface, tag]) : token.push(['。', '記号']) if(surface != '。' && surface != '.')
+        end
+      end
+      if token[token.size-1][0] == '。'
+        token.pop
+      end
+      return token
+    end
+  end
+end

data/lib/r_nlp/version.rb CHANGED

@@ -1,3 +1,3 @@
 module RNlp
-  VERSION = "0.1.7"
+  VERSION = "0.1.8"
 end

data/test.rb ADDED

@@ -0,0 +1,20 @@
+require 'r_nlp'
+a = RNlp::Tf.new('ja')
+text = '私は誰だ'
+p a.count(text)
+b = RNlp::Tf.new('en')
+text = 'who are you ?'
+p b.count(text)
+c = ['text 1 is hoge', 'text 2 is yeah', 'text 3 is hoge']
+idf = RNlp::Idf.new('en')
+p idf.calc_idf('hoge', c)
+str = 'これはテストです'
+morph = RNlp::Tokenize.new
+p morph.tokenize(str)

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: r_nlp
 version: !ruby/object:Gem::Version
-  version: 0.1.7
+  version: 0.1.8
 platform: ruby
 authors:
 - himkt
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-02-25 00:00:00.000000000 Z
+date: 2015-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: natto
@@ -72,8 +72,10 @@ files:
 - lib/r_nlp.rb
 - lib/r_nlp/idf.rb
 - lib/r_nlp/tf.rb
+- lib/r_nlp/tokenize.rb
 - lib/r_nlp/version.rb
 - r_nlp.gemspec
+- test.rb
 homepage: https://github.com/himkt/r_nlp
 licenses:
 - MIT