bunpa 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +57 -0
- data/spec/parser_spec.rb +31 -0
- data/spec/spec_helper.rb +20 -0
- metadata +78 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b68a4b60aa085901f9797f5beea784f11ac322ac
|
4
|
+
data.tar.gz: 08ab437798631a5f11b0a8d6b4311345152d31c7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1ae1d695acfb725315d2973409f2f0cb8d9c4d4951258a779c90d1e960c42bfb4f67eb4a78c719cf90ad00f3146fde10edac7fc63402ac52fa7b07e22472096e
|
7
|
+
data.tar.gz: 6c2715bfba32f2c9eac30f07de1f03d868ed1a1d45158f8b46b40161a31fcc00d6e66e1149bcdc84a028e69c60af8dc14d1e592aedf1bd36a08d6204d46119a8
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
Bunpa
|
2
|
+
==========================
|
3
|
+
|
4
|
+
Bunpa is an extremely simple wrapper around the MeCab Japanese grammar parser. It was designed with two key features in mind:
|
5
|
+
|
6
|
+
1. Simplicity - only returns the text and major part of speech for each component
|
7
|
+
2. Completeness - ensure that whitespace and any unknown characters are preserved
|
8
|
+
|
9
|
+
## Background
|
10
|
+
|
11
|
+
Bunpa parses Japanese text into a set of ordered components. Each component represents either a part of speech (noun, verb, etc.) or formatting (whitespace, etc.). All components have a text value (exactly as it appears in the text provided) and a kind (usually the part of speech).
|
12
|
+
|
13
|
+
All grammatical information is provided by the excellent [MeCab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html) Japanese part of speech and morphological analyser. Formatting information is inserted into the set of components in a post processing step (it is not done by MeCab). These components have a fake 'kind' assigned to them. Currently the following kinds of formatting components are handled by Bunpa:
|
14
|
+
* spaces (スペース)
|
15
|
+
* tabs (タブ)
|
16
|
+
* newlines (改行)
|
17
|
+
|
18
|
+
Any components that cannot be identified by either MeCab or Bunpa are marked as unknown (未知).
|
19
|
+
|
20
|
+
## Installation
|
21
|
+
|
22
|
+
From within your Rails application's base directory:
|
23
|
+
|
24
|
+
1. Edit your Gemfile and add:
|
25
|
+
|
26
|
+
gem 'bunpa'
|
27
|
+
|
28
|
+
2. Install the gem:
|
29
|
+
|
30
|
+
bundle
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
Bunpa operates as a very simple parser. It returns the components it identifies in an Enumerator in the same order as they appear in the document.
|
35
|
+
|
36
|
+
Basic usage is as follows:
|
37
|
+
|
38
|
+
```
|
39
|
+
require 'bunpa'
|
40
|
+
|
41
|
+
# Create the parser
|
42
|
+
parser = Bunpa::JapaneseTextParser.new
|
43
|
+
|
44
|
+
# Get an enumerable of Bunpa::Text::Components
|
45
|
+
components = parser.parse("こんにちは!お元気ですか。")
|
46
|
+
|
47
|
+
components.each do |component|
|
48
|
+
puts "#{component.text}\t(#{component.kind})"
|
49
|
+
end
|
50
|
+
|
51
|
+
```
|
52
|
+
|
53
|
+
For a slightly more detailed example, see the `usage_example.rb` script in the `bin` directory.
|
54
|
+
|
55
|
+
## Notes
|
56
|
+
|
57
|
+
This is very much a work in progress - it only has minimal testing at the moment, so use at your own risk :)
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Bunpa::JapaneseTextParser do
|
4
|
+
describe "#parse" do
|
5
|
+
before(:each) do
|
6
|
+
@content = <<TEST
|
7
|
+
ほんと、悲し過ぎるわよね。 でも、しかたないな。
|
8
|
+
そうだよ!「Very sad」 だもん。
|
9
|
+
TEST
|
10
|
+
@parser = Bunpa::JapaneseTextParser.new
|
11
|
+
end
|
12
|
+
|
13
|
+
it "converts a Japanese text string into a list of components of different types (grammar, formatting, etc.) in the same order as they appear in the string" do
|
14
|
+
expect(@parser.parse(@content).map { |c| c.text }.join).to eq(@content)
|
15
|
+
end
|
16
|
+
|
17
|
+
context "component categories" do
|
18
|
+
it "marks spaces as component kind スペース" do
|
19
|
+
expect(@parser.parse(" ").first.kind).to eq("スペース")
|
20
|
+
end
|
21
|
+
|
22
|
+
it "marks newlines as component kind 改行" do
|
23
|
+
expect(@parser.parse("\n").first.kind).to eq("改行")
|
24
|
+
end
|
25
|
+
|
26
|
+
it "marks tabs as component kind タブ" do
|
27
|
+
expect(@parser.parse("\t").first.kind).to eq("タブ")
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
Bundler.setup
|
3
|
+
|
4
|
+
require 'bunpa'
|
5
|
+
|
6
|
+
RSpec.configure do |config|
|
7
|
+
# ## Mock Framework
|
8
|
+
#
|
9
|
+
# If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
|
10
|
+
#
|
11
|
+
# config.mock_with :mocha
|
12
|
+
# config.mock_with :flexmock
|
13
|
+
# config.mock_with :rr
|
14
|
+
|
15
|
+
# Run specs in random order to surface order dependencies. If you find an
|
16
|
+
# order dependency and want to debug it, you can fix the order by providing
|
17
|
+
# the seed, which is printed after each run.
|
18
|
+
# --seed 1234
|
19
|
+
config.order = "random"
|
20
|
+
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bunpa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Daniel Carter
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: mecab
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.996'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.996'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: A simple wrapper around the MeCab Japanese grammar parser that maintains
|
42
|
+
formatting.
|
43
|
+
email: clownba0t@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- README.md
|
49
|
+
- spec/parser_spec.rb
|
50
|
+
- spec/spec_helper.rb
|
51
|
+
homepage: https://github.com/clownba0t/bunpa
|
52
|
+
licenses:
|
53
|
+
- MIT
|
54
|
+
metadata: {}
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 2.3.0
|
72
|
+
signing_key:
|
73
|
+
specification_version: 4
|
74
|
+
summary: bunpa v0.2.0
|
75
|
+
test_files:
|
76
|
+
- spec/spec_helper.rb
|
77
|
+
- spec/parser_spec.rb
|
78
|
+
has_rdoc:
|