lorem_jp 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3b5ed5259d27443978984fcdee63ad7f09d004d2
4
+ data.tar.gz: 4710492f6fc10793b6722337531a70eca870d232
5
+ SHA512:
6
+ metadata.gz: 6107f68c64a7c67f7cc33b6abb272c26fa7e3330025a1c3dae5c65dfb36388bd7d3debeef4255a380e611f3bb367d9988a700fa9423aeafbc50dc0dd78e74f27
7
+ data.tar.gz: 8ed7f6edf858bf1fe1dd4d5dfd2402bcfb2a78325eee9122407b2d50d79e3d095813bdc7d7a2e156dd0c20ae6a8f83c40750b6c64595e617b8e86aa0c3cac560
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 ITO Nobuaki
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,57 @@
1
+ # LoremJP
2
+
3
+ Japanese Lorem Ipsum generator.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ # Singleton API
9
+ puts LoremJP.sentence # => outputs a meaningless Japanese sentence
10
+
11
+ # Or create an instance and re-use it
12
+ generator = LoremJP.new
13
+ generator.sentence # => ...
14
+ generator.sentence # => ...
15
+ ```
16
+
17
+ A command-line tool, `lorem_jp`, is also available.
18
+
19
+ $ lorem_jp
20
+ blah blah blah ...
21
+
22
+ ## Installation
23
+
24
+ Add this line to your application's Gemfile:
25
+
26
+ gem 'lorem_jp', :github => 'dayflower/lorem_jp'
27
+
28
+ And then execute:
29
+
30
+ $ bundle
31
+
32
+ Or install it yourself as:
33
+
34
+ $ gem install lorem_jp
35
+
36
+ ## Notice
37
+
38
+ The default dictionary is assembled from out-of-copyright texts provided by [Aozora Bunko](http://www.aozora.gr.jp/index.html).
39
+
40
+ * "[Chawan no yu](http://www.aozora.gr.jp/cards/000042/card2363.html)"
41
+ by "[Torahiko Terada](http://www.aozora.gr.jp/index_pages/person42.html)"
42
+ * "[Akai fune no okyaku](http://www.aozora.gr.jp/cards/001475/card52960.html)"
43
+ by "[Mimei Ogawa](http://www.aozora.gr.jp/index_pages/person1475.html)"
44
+
45
+ ## TODO
46
+
47
+ * write documentation for usage (in README)
48
+ * write documentation for building a custom dictionary
49
+ * write more tests
50
+
51
+ ## Contributing
52
+
53
+ 1. Fork it
54
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
55
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
56
+ 4. Push to the branch (`git push origin my-new-feature`)
57
+ 5. Create a new Pull Request
@@ -0,0 +1,7 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# Registers the test task: every *_spec.rb file under test/ is run, with
# test/ itself added to the load path.
Rake::TestTask.new do |task|
  task.libs << 'test'
  task.pattern = 'test/**/*_spec.rb'
end
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'lorem_jp'
4
+ require 'lorem_jp/cli'
5
+
6
+ LoremJP::CLI.main
@@ -0,0 +1,248 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'fileutils'
4
+ require 'stringio'
5
+ require 'open-uri'
6
+
7
# Mixin for one stage of a line-processing chain.
#
# A stage receives lines through #input and forwards whatever it wants to
# keep to the stage stored in #next_filter by calling the private #puts.
module TextFilter
  # The stage that receives this stage's output.
  attr_accessor :next_filter

  # Accepts one line. Including classes are expected to override this;
  # the default implementation always raises RuntimeError.
  def input(line)
    raise 'must be overriden'
  end

  # Signals end of input by forwarding the call to the following stage,
  # when one is set.
  def finish
    @next_filter.finish if @next_filter
  end

  private

  # Hands +line+ to the following stage. Inside an including class this
  # shadows Kernel#puts, so a bare `puts line` means "emit downstream".
  def puts(line)
    @next_filter.input(line)
  end
end
26
+
27
# Terminal chain stage that simply collects every line it receives; the
# collected lines are the elements of the array itself.
class ResultCatcher < Array
  include TextFilter

  # Appends +line+ to the collected results and returns the catcher.
  def input(line)
    push(line)
  end
end
34
+
35
# Chain stage that writes every line it receives to a UTF-8 text file.
class FileOutput
  include TextFilter

  # Opens (creating or truncating) +filename+ for writing.
  #
  # File.open is used rather than the bare Kernel#open: this script loads
  # open-uri, and Kernel#open additionally interprets some arguments as
  # something other than a plain path (for example a leading "|" starts a
  # subprocess on the Ruby versions that support it). The argument here is
  # always meant to be a local file.
  def initialize(filename)
    @handle = File.open(filename, 'w:utf-8')
  end

  # Writes +line+ to the file as-is (no newline is appended).
  def input(line)
    @handle.write line
  end

  # Closes the file, then lets the mixin forward the end-of-input signal
  # to the following stage, if any.
  def finish
    @handle.close

    super
  end
end
54
+
55
# An ordered list of chain stages. After #setup, feeding the manager feeds
# the first stage, which forwards to the second, and so on.
class TextFilterManager < Array
  # Links each stage to the one that follows it; the last stage gets a nil
  # successor. Must be called after all stages have been appended.
  def setup
    reverse_each.inject(nil) do |successor, stage|
      stage.next_filter = successor
      stage
    end

    # Same return value as before: the index of the last stage.
    size - 1
  end

  # Feeds +line+ to the first stage.
  def input(line)
    first.input line
  end

  # Signals end of input to the first stage.
  def finish
    first.finish
  end
end
72
+
73
# Chain stage that transcodes every line to UTF-8 before passing it on.
class UTF8Converter
  include TextFilter

  # Converts +line+ from its current encoding to UTF-8 and emits it.
  def input(line)
    converted = line.encode('UTF-8')
    puts converted
  end
end
80
+
81
# Chain stage that removes leading indentation and drops blank lines.
class BlankTrimmer
  include TextFilter

  # Leading run of spaces. The previous pattern was written in extended
  # (/x) mode, where a literal space is insignificant, so it left a bare
  # quantifier on the `^` anchor and never described "leading spaces".
  # A character class is immune to that. The ideographic space (U+3000) is
  # included as well as the ASCII space.
  # NOTE(review): the source texts presumably indent with U+3000 - confirm.
  LEADING_SPACES = /^[ \u3000]+/

  # A line consisting of nothing but (ASCII) whitespace.
  BLANK_LINE = /^\s*$/

  # Emits +line+ without its leading spaces, unless nothing but whitespace
  # remains. The argument is left untouched (a copy is trimmed), so callers
  # may pass frozen or shared strings.
  #
  # +line+ is expected to be UTF-8 compatible.
  def input(line)
    trimmed = line.sub(LEADING_SPACES, '')

    puts trimmed if trimmed !~ BLANK_LINE
  end
end
92
+
93
# Chain stage that strips Aozora Bunko markup from a line, leaving only the
# body text.
#
# The full-width marker characters are written as \u escapes so that they
# cannot be silently folded to their ASCII look-alikes. The ASCII forms do
# something else entirely: a pattern starting with "|" has an empty
# alternative that matches everywhere, and "[#.*?]" is a character class
# that deletes individual "#", ".", "*" and "?" characters.
class AozoraTrimmer
  include TextFilter

  # Ruby (reading) with an explicit start marker: U+FF5C, the annotated
  # text, then the reading inside double angle brackets. Only the annotated
  # text (capture 1) is kept.
  MARKED_RUBY = /\uFF5C(\S+?)《.*?》/m

  # A reading in double angle brackets without a start marker.
  RUBY_READING = /《.*?》/m

  # Editorial annotation: U+FF3B U+FF03 ... U+FF3D
  # (full-width "[", "#" and "]").
  ANNOTATION = /\uFF3B\uFF03.*?\uFF3D/m

  # Emits +line+ with marked ruby reduced to its base text, remaining
  # readings removed, and editorial annotations removed - in that order,
  # so the marked form is handled before the generic reading pattern
  # would eat its brackets.
  #
  # +line+ is expected to be UTF-8 compatible.
  def input(line)
    puts line.gsub(MARKED_RUBY, '\\1')
             .gsub(RUBY_READING, '')
             .gsub(ANNOTATION, '')
  end
end
102
+
103
# Chain stage that discards everything up to and including the second
# line beginning with ten hyphens, and passes through every line after it.
class AozoraTrimHeader
  include TextFilter

  def initialize
    # Number of hyphen-rule lines seen so far (0, 1 or 2).
    @state = 0
  end

  # Swallows +line+ while fewer than two rule lines have been seen (the
  # rule lines themselves are swallowed too); emits it afterwards.
  def input(line)
    if @state == 2
      puts line
    elsif [0, 1].include?(@state)
      @state += 1 if line =~ %r{^----------}
    end
  end
end
121
+
122
# Chain stage that passes lines through until the colophon starts, then
# drops that line and everything after it.
class AozoraTrimTrailer
  include TextFilter

  # First line of the colophon: the word "底本" followed by a colon.
  # Both the ASCII colon and the full-width colon (U+FF1A) are accepted;
  # matching only the ASCII colon would let a colophon written with the
  # full-width one leak into the output.
  # NOTE(review): Aozora Bunko texts are believed to use the full-width
  # colon here - confirm against the downloaded files.
  COLOPHON_START = /^底本[:\uFF1A]/

  def initialize
    # 0 while still in the body, 1 once the colophon has been reached.
    @state = 0
  end

  # Emits +line+ while still in the body; swallows the colophon marker
  # line and every line after it.
  #
  # +line+ is expected to be UTF-8 compatible.
  def input(line)
    return if @state == 1

    if line =~ COLOPHON_START
      @state = 1
    else
      puts line
    end
  end
end
142
+
143
# Class-level helpers for downloading an archive once and reading a member
# out of it with the external `unzip` command.
class StandardFetcher
  class << self
    # Directory (next to this script) where downloaded archives are cached.
    DOWNLOAD_DIR = File.absolute_path('../download/', __FILE__)

    # Downloads +url+ into DOWNLOAD_DIR as +filename+ and returns the local
    # path. An already-downloaded file is reused unless options[:force] is
    # truthy. A partially written file is removed when the download fails,
    # and the original error is re-raised.
    def fetch(filename, url, options = {})
      output_file = File.join(DOWNLOAD_DIR, filename)
      return output_file if File.exist?(output_file) && !options[:force]

      # makedirs is a no-op when the directory already exists.
      FileUtils.makedirs(DOWNLOAD_DIR)

      begin
        File.open(output_file, 'wb:ASCII-8BIT') do |file|
          open_remote(url) { |net| IO.copy_stream(net, file) }
        end
      rescue => error
        # Best-effort cleanup; rm_f never raises, so it cannot mask +error+.
        FileUtils.rm_f(output_file)
        raise error
      end

      output_file
    end

    # Reads member +target+ of the zip file +archive+ and returns its
    # content as a rewound StringIO. +options+ is passed to IO.popen; by
    # default the member is read as CP932.
    #
    # The command is given to IO.popen as an argument array, so no shell is
    # involved and paths containing spaces or shell metacharacters are
    # passed to unzip verbatim. unzip's stderr is discarded.
    #
    # Raises RuntimeError when +archive+ does not exist or unzip exits
    # unsuccessfully; a missing unzip executable surfaces as the
    # SystemCallError raised by IO.popen.
    def extract(archive, target, options = { :external_encoding => 'CP932' })
      raise "archive not found: #{archive}" unless File.exist?(archive)

      command = ['unzip', '-xqc', archive, target]
      result = StringIO.new

      IO.popen(command, 'r', options.merge(:err => File::NULL)) do |io|
        io.each_line { |line| result.write line }
      end
      raise "#{$?}" unless $?.success?

      result.rewind
      result
    end

    private

    # Opens +url+ for binary reading and yields the stream.
    #
    # URI.open is the supported open-uri entry point on current Ruby, where
    # Kernel#open no longer understands URLs; Kernel.open is kept as the
    # fallback for old interpreters whose open-uri has no URI.open.
    def open_remote(url, &block)
      if URI.respond_to?(:open)
        URI.open(url, 'rb', &block)
      else
        Kernel.open(url, 'rb', &block)
      end
    end
  end
end
206
+
207
# Downloads an Aozora Bunko archive and turns one of its members into a
# cleaned-up UTF-8 text file.
class AozoraFetcher < StandardFetcher
  class << self
    # Directory (next to this script) where the cleaned texts are written.
    TEXT_DIR = File.absolute_path('../text/', __FILE__)

    # Produces the cleaned text and returns its path.
    #
    # Keys read from +args+:
    #   :output       - file name of the result inside TEXT_DIR
    #   :archive_name - file name under which the archive is cached
    #   :url          - where to download the archive from
    #   :source       - name of the archive member to extract
    #   :force        - regenerate the text even if it already exists
    #                   (a cached archive is still reused)
    #
    # The archive is fetched and extracted BEFORE the output file is
    # opened: opening truncates or creates it, so doing it first would
    # leave an empty file behind on a failed download, and the next run
    # would mistake that file for a finished result. For the same reason a
    # partially written output is removed if filtering fails.
    def run(args = {})
      @output_file = File.join(TEXT_DIR, args[:output])
      return @output_file if File.exist?(@output_file) && !args[:force]

      archive = fetch(args[:archive_name], args[:url])
      source = extract(archive, args[:source])

      # makedirs is a no-op when the directory already exists.
      FileUtils.makedirs(TEXT_DIR)

      manager = create_manager(@output_file)
      completed = false
      begin
        source.each { |line| manager.input line }
        completed = true
      ensure
        # Always close the output; drop it when it is incomplete.
        manager.finish
        File.unlink(@output_file) if !completed && File.exist?(@output_file)
      end

      @output_file
    end

    # Builds the filter chain: transcode to UTF-8, drop header and trailer,
    # trim blanks, strip markup, write to +output_file+ (which is opened
    # immediately). Defaults to the path set by the most recent #run.
    def create_manager(output_file = @output_file)
      manager = TextFilterManager.new
      manager << UTF8Converter.new
      manager << AozoraTrimHeader.new
      manager << AozoraTrimTrailer.new
      manager << BlankTrimmer.new
      manager << AozoraTrimmer.new
      manager << FileOutput.new(output_file)
      manager.setup

      manager
    end
  end
end
@@ -0,0 +1,183 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'open3'
3
+ require 'optparse'
4
+
5
# Builds a Markov-chain dictionary from morphological-analyser output
# (one "word<whitespace>part-of-speech" token per line, as fed by CLI.main
# from `mecab -O simple`).
#
# Words are numbered in order of first appearance. Id 0 is the empty word
# and doubles as the "start of sentence" filler in the history; id -1 marks
# the end of a sentence. For every history of +chain+ preceding ids the
# calculator records the ids that followed it.
class MarkovCalculator
  # Pseudo word id recorded when a sentence ends.
  EOS = -1

  # Tokens that end a sentence. The full-width question and exclamation
  # marks (U+FF1F, U+FF01) are accepted next to their ASCII forms.
  # NOTE(review): Japanese input presumably uses the full-width forms -
  # confirm against real analyser output.
  TERMINATORS = ['。', '?', '!', "\uFF1F", "\uFF01"].freeze

  # options[:chain]       - history length (default 1); must be an
  #                         Integer >= 1, otherwise ArgumentError is raised
  #                         instead of failing later in an obscure way
  # options[:ignore_type] - when truthy, tokens are identified by the word
  #                         alone; otherwise by the whole input line
  def initialize(options = {})
    @chain = options[:chain] || 1
    unless @chain.is_a?(Integer) && @chain >= 1
      raise ArgumentError,
            "chain must be a positive integer (got #{@chain.inspect})"
    end
    @ignore_type = options[:ignore_type]

    # word dictionary
    @dict = { '' => { :id => 0, :word => '', :next => [] } }
    @word_id = 1

    # probability (candidates)
    @tree = {}

    clear_stack
  end

  # Consumes one analyser line. "EOS" lines and lines without a word are
  # ignored, an opening quote is skipped, a closing quote ends the current
  # sentence, and any other token is registered and recorded as a follower
  # of the current history. Always returns nil.
  def input_line(line)
    line = line.chomp
    return if line == 'EOS'

    word, = line.split(%r{\s+})
    # A blank line (or one starting with whitespace) carries no word.
    return if word.nil? || word.empty?

    if word == '」'
      sentence_is_terminated
      return
    end

    return if word == '「'

    key = @ignore_type ? word : line

    # register word to dictionary
    if @dict.has_key?(key)
      id = @dict[key][:id]
    else
      id = @word_id
      @word_id += 1
      @dict[key] = { :id => id, :word => word }
    end

    # add to candidates, then slide the history window
    add_word_to_candidate(id)
    @stack.shift
    @stack << id

    sentence_is_terminated if TERMINATORS.include?(word)

    nil
  end

  # Consumes every line yielded by +lines+ (anything responding to #each).
  def input(lines)
    lines.each do |line|
      input_line(line.chomp)
    end
  end

  # Writes the dictionary to +handle+: the chain length, one word per line
  # in id order, an empty separator line, then the candidate tree.
  def output_dictionary(handle)
    handle.puts @chain.to_s
    output_words handle
    handle.puts
    output_tree handle
  end

  # Writes every known word, one per line, ordered by id.
  def output_words(handle)
    @dict.values.sort_by { |item| item[:id] }.each do |item|
      handle.puts item[:word]
    end
  end

  # Writes the candidate tree, one history id per line, indented by depth;
  # the deepest level is written as "id=candidate,candidate,...".
  def output_tree(handle)
    output_tree_node(handle, @tree, 0)
  end

  private

  # Writes the children of +node+ in ascending id order at +depth+.
  def output_tree_node(handle, node, depth)
    node.keys.sort.each do |key|
      child = node[key]

      handle.write(' ' * depth)
      handle.write key

      if child.has_key?(:cands)
        handle.write '='

        cands = child[:cands].sort
        # A list of identical candidates is collapsed to a single entry.
        cands = [cands.first] if cands.all? { |v| v == cands.first }

        handle.puts cands.join(',')
      else
        handle.write "\n"

        output_tree_node(handle, child, depth + 1)
      end
    end
  end

  # Ends the current sentence: records EOS as the follower of the current
  # history and of each shorter tail of it, then resets the history.
  def sentence_is_terminated
    # No word since the last boundary (e.g. a closing quote right after a
    # full stop): recording EOS here would register "end of sentence" as a
    # possible FIRST word.
    return if @stack.all? { |id| id == 0 }

    until @stack.first == EOS
      add_word_to_candidate(EOS)

      @stack.shift
      @stack << EOS
    end

    clear_stack
  end

  # Records +word_id+ as a follower of the current history, creating the
  # tree path for that history as needed.
  def add_word_to_candidate(word_id)
    node = @stack.inject(@tree) { |current, wid| current[wid] ||= {} }

    node[:cands] ||= []
    node[:cands] << word_id
  end

  # Resets the history to "start of sentence".
  def clear_stack
    @stack = [0] * @chain
  end

  # Command-line front end: pipes the input files (or stdin) through
  # `mecab -O simple` and prints the resulting dictionary on stdout.
  class CLI
    # Whitespace at either end of a line, including the ideographic space
    # (U+3000), which \s does not cover.
    EDGE_WHITESPACE = /(?: ^ [\s\u3000]+ | [\s\u3000]+ $ )/xm

    def self.main
      chain = 1
      ignore_type = false

      opt = OptionParser.new

      # Integer coercion makes OptionParser reject non-numeric values
      # instead of silently turning them into 0.
      opt.on('-c CHAIN', Integer, 'chain of precedences (default: 1)') {
        |v| chain = v
      }
      opt.on('-n', 'ignore a part of speech') {
        |v| ignore_type = v
      }

      opt.parse! ARGV

      calculator = MarkovCalculator.new(:chain => chain,
                                        :ignore_type => ignore_type)

      Open3.popen3('mecab -O simple') { |stdin, stdout, _stderr, _wait_thr|
        # Feed mecab from a separate thread so its output can be read
        # concurrently.
        feeder = Thread.new {
          begin
            ARGF.set_encoding 'utf-8:utf-8'
            ARGF.each do |line|
              stdin.puts line.gsub(EDGE_WHITESPACE, '')
            end
          ensure
            # Always close, even on error: otherwise mecab never sees EOF
            # and the reader below would wait forever.
            stdin.close
          end
        }

        calculator.input(stdout)

        # Surface any error raised while feeding.
        feeder.join
      }

      calculator.output_dictionary(STDOUT)
    end
  end
end
180
+
181
# Run the command-line front end only when this file is executed directly,
# not when it is loaded as a library.
MarkovCalculator::CLI.main if __FILE__ == $PROGRAM_NAME