bunpa 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +57 -0
- data/spec/parser_spec.rb +31 -0
- data/spec/spec_helper.rb +20 -0
- metadata +78 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b68a4b60aa085901f9797f5beea784f11ac322ac
|
4
|
+
data.tar.gz: 08ab437798631a5f11b0a8d6b4311345152d31c7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1ae1d695acfb725315d2973409f2f0cb8d9c4d4951258a779c90d1e960c42bfb4f67eb4a78c719cf90ad00f3146fde10edac7fc63402ac52fa7b07e22472096e
|
7
|
+
data.tar.gz: 6c2715bfba32f2c9eac30f07de1f03d868ed1a1d45158f8b46b40161a31fcc00d6e66e1149bcdc84a028e69c60af8dc14d1e592aedf1bd36a08d6204d46119a8
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
Bunpa
|
2
|
+
==========================
|
3
|
+
|
4
|
+
Bunpa is an extremely simple wrapper around the MeCab Japanese grammar parser. It was designed with two key features in mind:
|
5
|
+
|
6
|
+
1. Simplicity - only returns the text and major part of speech for each component
|
7
|
+
2. Completeness - ensure that whitespace and any unknown characters are preserved
|
8
|
+
|
9
|
+
## Background
|
10
|
+
|
11
|
+
Bunpa parses Japanese text into a set of ordered components. Each component represents either a part of speech (noun, verb, etc.) or formatting (whitespace, etc.). All components have a text value (exactly as it appears in the text provided) and a kind (usually the part of speech).
|
12
|
+
|
13
|
+
All grammatical information is provided by the excellent [MeCab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html) Japanese part of speech and morphological analyser. Formatting information is inserted into the set of components in a post processing step (it is not done by MeCab). These components have a fake 'kind' assigned to them. Currently the following kinds of formatting components are handled by Bunpa:
|
14
|
+
* spaces (スペース)
|
15
|
+
* tabs (タブ)
|
16
|
+
* newlines (改行)
|
17
|
+
|
18
|
+
Any components that cannot be identified by either MeCab or Bunpa are marked as unknown (未知).
|
19
|
+
|
20
|
+
## Installation
|
21
|
+
|
22
|
+
From within your Rails application's base directory:
|
23
|
+
|
24
|
+
1. Edit your Gemfile and add:
|
25
|
+
|
26
|
+
gem 'bunpa'
|
27
|
+
|
28
|
+
2. Install the gem:
|
29
|
+
|
30
|
+
bundle
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
Bunpa operates as a very simple parser. It returns the components it identifies in an Enumerator in the same order as they appear in the document.
|
35
|
+
|
36
|
+
Basic usage is as follows:
|
37
|
+
|
38
|
+
```
|
39
|
+
require 'bunpa'
|
40
|
+
|
41
|
+
# Create the parser
|
42
|
+
parser = Bunpa::JapaneseTextParser.new
|
43
|
+
|
44
|
+
# Get an enumerable of Bunpa::Text::Components
|
45
|
+
components = parser.parse("こんにちは!お元気ですか。")
|
46
|
+
|
47
|
+
components.each do |component|
|
48
|
+
puts "#{component.text}\t(#{component.kind})"
|
49
|
+
end
|
50
|
+
|
51
|
+
```
|
52
|
+
|
53
|
+
For a slightly more detailed example, see the `usage_example.rb` script in the `bin` directory.
|
54
|
+
|
55
|
+
## Notes
|
56
|
+
|
57
|
+
This is very much a work in progress - it only has minimal testing at the moment, so use at your own risk :)
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Bunpa::JapaneseTextParser do
|
4
|
+
describe "#parse" do
|
5
|
+
before(:each) do
|
6
|
+
@content = <<TEST
|
7
|
+
ほんと、悲し過ぎるわよね。 でも、しかたないな。
|
8
|
+
そうだよ!「Very sad」 だもん。
|
9
|
+
TEST
|
10
|
+
@parser = Bunpa::JapaneseTextParser.new
|
11
|
+
end
|
12
|
+
|
13
|
+
it "converts a Japanese text string into a list of components of different types (grammar, formatting, etc.) in the same order as they appear in the string" do
|
14
|
+
expect(@parser.parse(@content).map { |c| c.text }.join).to eq(@content)
|
15
|
+
end
|
16
|
+
|
17
|
+
context "component categories" do
|
18
|
+
it "marks spaces as component kind スペース" do
|
19
|
+
expect(@parser.parse(" ").first.kind).to eq("スペース")
|
20
|
+
end
|
21
|
+
|
22
|
+
it "marks newlines as component kind 改行" do
|
23
|
+
expect(@parser.parse("\n").first.kind).to eq("改行")
|
24
|
+
end
|
25
|
+
|
26
|
+
it "marks tabs as component kind タブ" do
|
27
|
+
expect(@parser.parse("\t").first.kind).to eq("タブ")
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
Bundler.setup
|
3
|
+
|
4
|
+
require 'bunpa'
|
5
|
+
|
6
|
+
RSpec.configure do |config|
|
7
|
+
# ## Mock Framework
|
8
|
+
#
|
9
|
+
# If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
|
10
|
+
#
|
11
|
+
# config.mock_with :mocha
|
12
|
+
# config.mock_with :flexmock
|
13
|
+
# config.mock_with :rr
|
14
|
+
|
15
|
+
# Run specs in random order to surface order dependencies. If you find an
|
16
|
+
# order dependency and want to debug it, you can fix the order by providing
|
17
|
+
# the seed, which is printed after each run.
|
18
|
+
# --seed 1234
|
19
|
+
config.order = "random"
|
20
|
+
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bunpa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Daniel Carter
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mecab
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.996'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.996'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: A simple wrapper around the MeCab Japanese grammar parser that maintains
|
42
|
+
formatting.
|
43
|
+
email: clownba0t@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- README.md
|
49
|
+
- spec/parser_spec.rb
|
50
|
+
- spec/spec_helper.rb
|
51
|
+
homepage: https://github.com/clownba0t/bunpa
|
52
|
+
licenses:
|
53
|
+
- MIT
|
54
|
+
metadata: {}
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 2.3.0
|
72
|
+
signing_key:
|
73
|
+
specification_version: 4
|
74
|
+
summary: bunpa v0.2.0
|
75
|
+
test_files:
|
76
|
+
- spec/spec_helper.rb
|
77
|
+
- spec/parser_spec.rb
|
78
|
+
has_rdoc:
|