bunpa 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b68a4b60aa085901f9797f5beea784f11ac322ac
4
+ data.tar.gz: 08ab437798631a5f11b0a8d6b4311345152d31c7
5
+ SHA512:
6
+ metadata.gz: 1ae1d695acfb725315d2973409f2f0cb8d9c4d4951258a779c90d1e960c42bfb4f67eb4a78c719cf90ad00f3146fde10edac7fc63402ac52fa7b07e22472096e
7
+ data.tar.gz: 6c2715bfba32f2c9eac30f07de1f03d868ed1a1d45158f8b46b40161a31fcc00d6e66e1149bcdc84a028e69c60af8dc14d1e592aedf1bd36a08d6204d46119a8
data/README.md ADDED
@@ -0,0 +1,57 @@
1
+ Bunpa
2
+ ==========================
3
+
4
+ Bunpa is an extremely simple wrapper around the MeCab Japanese grammar parser. It was designed with two key features in mind:
5
+
6
+ 1. Simplicity - only returns the text and major part of speech for each component
7
+ 2. Completeness - ensures that whitespace and any unknown characters are preserved
8
+
9
+ ## Background
10
+
11
+ Bunpa parses Japanese text into a set of ordered components. Each component represents either a part of speech (noun, verb, etc.) or formatting (whitespace, etc.). All components have a text value (exactly as they appear in the text provided) and a kind (usually part of speech).
12
+
13
+ All grammatical information is provided by the excellent [MeCab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html) Japanese part of speech and morphological analyser. Formatting information is inserted into the set of components in a post-processing step (it is not done by MeCab). These components have a fake 'kind' assigned to them. Currently the following kinds of formatting components are handled by Bunpa:
14
+ * spaces (スペース)
15
+ * tabs (タブ)
16
+ * newlines (改行)
17
+
18
+ Any components that cannot be identified by either MeCab or Bunpa are marked as unknown (未知).
19
+
20
+ ## Installation
21
+
22
+ From within your Rails application's base directory:
23
+
24
+ 1. Edit your Gemfile and add:
25
+
26
+ gem 'bunpa'
27
+
28
+ 2. Install the gem:
29
+
30
+ bundle
31
+
32
+ ## Usage
33
+
34
+ Bunpa operates as a very simple parser. It returns the components it identifies in an Enumerator in the same order as they appear in the document.
35
+
36
+ Basic usage is as follows:
37
+
38
+ ```
39
+ require 'bunpa'
40
+
41
+ # Create the parser
42
+ parser = Bunpa::JapaneseTextParser.new
43
+
44
+ # Get an enumerable of Bunpa::Text::Components
45
+ components = parser.parse("こんにちは!お元気ですか。")
46
+
47
+ components.each do |component|
48
+ puts "#{component.text}\t(#{component.kind})"
49
+ end
50
+
51
+ ```
52
+
53
+ For a slightly more detailed example, see the `usage_example.rb` script in the `bin` directory.
54
+
55
+ ## Notes
56
+
57
+ This is very much a work in progress - it only has minimal testing at the moment, so use at your own risk :)
@@ -0,0 +1,31 @@
1
+ require 'spec_helper'
2
+
3
+ describe Bunpa::JapaneseTextParser do
4
+ describe "#parse" do
5
+ before(:each) do
6
+ @content = <<TEST
7
+ ほんと、悲し過ぎるわよね。 でも、しかたないな。
8
+ そうだよ!「Very sad」 だもん。
9
+ TEST
10
+ @parser = Bunpa::JapaneseTextParser.new
11
+ end
12
+
13
+ it "converts a Japanese text string into a list of components of different types (grammar, formatting, etc.) in the same order as they appear in the string" do
14
+ expect(@parser.parse(@content).map { |c| c.text }.join).to eq(@content)
15
+ end
16
+
17
+ context "component categories" do
18
+ it "marks spaces as component kind スペース" do
19
+ expect(@parser.parse(" ").first.kind).to eq("スペース")
20
+ end
21
+
22
+ it "marks newlines as component kind 改行" do
23
+ expect(@parser.parse("\n").first.kind).to eq("改行")
24
+ end
25
+
26
+ it "marks tabs as component kind タブ" do
27
+ expect(@parser.parse("\t").first.kind).to eq("タブ")
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,20 @@
1
+ require 'bundler/setup'
2
+ Bundler.setup
3
+
4
+ require 'bunpa'
5
+
6
+ RSpec.configure do |config|
7
+ # ## Mock Framework
8
+ #
9
+ # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
10
+ #
11
+ # config.mock_with :mocha
12
+ # config.mock_with :flexmock
13
+ # config.mock_with :rr
14
+
15
+ # Run specs in random order to surface order dependencies. If you find an
16
+ # order dependency and want to debug it, you can fix the order by providing
17
+ # the seed, which is printed after each run.
18
+ # --seed 1234
19
+ config.order = "random"
20
+ end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bunpa
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Daniel Carter
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mecab
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.996'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.996'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A simple wrapper around the MeCab Japanese grammar parser that maintains
42
+ formatting.
43
+ email: clownba0t@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - README.md
49
+ - spec/parser_spec.rb
50
+ - spec/spec_helper.rb
51
+ homepage: https://github.com/clownba0t/bunpa
52
+ licenses:
53
+ - MIT
54
+ metadata: {}
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project:
71
+ rubygems_version: 2.3.0
72
+ signing_key:
73
+ specification_version: 4
74
+ summary: bunpa v0.2.0
75
+ test_files:
76
+ - spec/spec_helper.rb
77
+ - spec/parser_spec.rb
78
+ has_rdoc: