lorem_jp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3b5ed5259d27443978984fcdee63ad7f09d004d2
4
+ data.tar.gz: 4710492f6fc10793b6722337531a70eca870d232
5
+ SHA512:
6
+ metadata.gz: 6107f68c64a7c67f7cc33b6abb272c26fa7e3330025a1c3dae5c65dfb36388bd7d3debeef4255a380e611f3bb367d9988a700fa9423aeafbc50dc0dd78e74f27
7
+ data.tar.gz: 8ed7f6edf858bf1fe1dd4d5dfd2402bcfb2a78325eee9122407b2d50d79e3d095813bdc7d7a2e156dd0c20ae6a8f83c40750b6c64595e617b8e86aa0c3cac560
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 ITO Nobuaki
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,57 @@
1
+ # LoremJP
2
+
3
+ Japanese Lorem Ipsum generator.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ # Singleton API
9
+ puts LoremJP.sentence # => outputs a meaningless Japanese sentence
10
+
11
+ # Or create an instance and re-use it
12
+ generator = LoremJP.new
13
+ generator.sentence # => ...
14
+ generator.sentence # => ...
15
+ ```
16
+
17
+ A command-line tool, `lorem_jp`, is also available.
18
+
19
+ $ lorem_jp
20
+ blah blah blah ...
21
+
22
+ ## Installation
23
+
24
+ Add this line to your application's Gemfile:
25
+
26
+ gem 'lorem_jp', :github => 'dayflower/lorem_jp'
27
+
28
+ And then execute:
29
+
30
+ $ bundle
31
+
32
+ Or install it yourself as:
33
+
34
+ $ gem install lorem_jp
35
+
36
+ ## Notice
37
+
38
+ The default dictionary is assembled from out-of-copyright texts provided by [Aozora Bunko](http://www.aozora.gr.jp/index.html).
39
+
40
+ * "[Chawan no yu](http://www.aozora.gr.jp/cards/000042/card2363.html)"
41
+ by "[Torahiko Terada](http://www.aozora.gr.jp/index_pages/person42.html)"
42
+ * "[Akai fune no okyaku](http://www.aozora.gr.jp/cards/001475/card52960.html)"
43
+ by "[Mimei Ogawa](http://www.aozora.gr.jp/index_pages/person1475.html)"
44
+
45
+ ## TODO
46
+
47
+ * write usage documentation (in README)
48
+ * write documentation for building a custom dictionary
49
+ * write more tests
50
+
51
+ ## Contributing
52
+
53
+ 1. Fork it
54
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
55
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
56
+ 4. Push to the branch (`git push origin my-new-feature`)
57
+ 5. Create new Pull Request
@@ -0,0 +1,7 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# Registers a Rake::TestTask that puts the test/ directory on the load path
# and picks up every file matching test/**/*_spec.rb.
Rake::TestTask.new do |task|
  task.libs << 'test'
  task.pattern = 'test/**/*_spec.rb'
end
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'lorem_jp'
4
+ require 'lorem_jp/cli'
5
+
6
+ LoremJP::CLI.main
@@ -0,0 +1,248 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'fileutils'
4
+ require 'stringio'
5
+ require 'open-uri'
6
+
7
# Mixin for one stage of a line-oriented filter chain.
#
# Stages are linked through +next_filter+. An including class overrides
# #input to process a line and uses the private #puts to hand a line on to
# the stage that follows it.
module TextFilter
  attr_accessor :next_filter

  # Processes one line. Including classes must override this method.
  #
  # Raises RuntimeError when it has not been overridden.
  def input(line)
    raise 'must be overriden'
  end

  # Signals the end of input by passing #finish on to the next stage, if
  # there is one.
  def finish
    @next_filter.finish if @next_filter
  end

  private

  # Hands +line+ to the next stage's #input. Inside including classes this
  # takes the place of Kernel#puts.
  def puts(line)
    next_filter.input(line)
  end
end
26
+
27
# Filter stage that simply collects every line it receives; the collected
# lines are the elements of the array itself.
class ResultCatcher < Array
  include TextFilter

  # Appends +line+ to this array and returns the array.
  def input(line)
    push(line)
  end
end
34
+
35
# Filter stage that writes every line it receives to a file (UTF-8).
class FileOutput
  include TextFilter

  # filename - path of the file to create or truncate.
  def initialize(filename)
    # File.open is used instead of Kernel#open so that a name starting with
    # "|" is always treated as a path and never as a command to run.
    @handle = File.open(filename, 'w:utf-8')
  end

  # Writes +line+ to the file as-is (no newline is added).
  def input(line)
    @handle.write line
  end

  # Closes the file, then propagates #finish to the next stage, if any.
  # Calling it more than once is harmless.
  def finish
    @handle.close unless @handle.closed?

    super
  end
end
54
+
55
# Ordered list of filter stages. After #setup has linked them, lines fed to
# #input enter at the first stage.
class TextFilterManager < Array
  # Links every stage to the one that follows it; the last stage gets a
  # +next_filter+ of nil.
  #
  # Returns the index of the last stage (-1 when the list is empty).
  def setup
    successor = nil
    reverse_each do |stage|
      stage.next_filter = successor
      successor = stage
    end

    size - 1
  end

  # Feeds +line+ to the first stage.
  def input(line)
    first.input(line)
  end

  # Tells the first stage that the input has ended.
  def finish
    first.finish
  end
end
72
+
73
# Filter stage that transcodes each line to UTF-8 before passing it on.
class UTF8Converter
  include TextFilter

  # Converts +line+ from its current encoding to UTF-8 and forwards it.
  def input(line)
    utf8_line = line.encode(Encoding::UTF_8)
    puts utf8_line
  end
end
80
+
81
# Filter stage that strips leading spaces and drops lines that are blank.
class BlankTrimmer
  include TextFilter

  # Leading run of ASCII spaces and ideographic (full-width, U+3000) spaces.
  # Written as a character class without the /x flag: under /x a literal
  # space in the pattern is ignored by the regexp engine.
  LEADING_SPACES = /^[ \u3000]+/

  # A line made up of whitespace only.
  BLANK_LINE = /^\s*$/

  # Forwards +line+ without its leading spaces, unless nothing but
  # whitespace remains. The caller's string is left unmodified.
  def input(line)
    trimmed = line.sub(LEADING_SPACES, '')

    puts trimmed unless trimmed =~ BLANK_LINE
  end
end
92
+
93
# Filter stage that removes Aozora Bunko markup from a line:
# ruby (reading) annotations and bracketed editorial notes.
class AozoraTrimmer
  include TextFilter

  # Bar-delimited base text followed by its reading: the bar and the 《...》
  # part are removed, the base text (capture 1) is kept. The bar is matched
  # as a literal character (full-width or ASCII); left unescaped it would be
  # an alternation with an empty branch.
  RUBY_WITH_BAR = /[｜|](\S+?)《.*?》/m

  # Any remaining 《...》 reading.
  RUBY = /《.*?》/m

  # Editorial note of the form ［＃...］ (full-width brackets and sharp),
  # removed as a whole.
  NOTE = /［＃.*?］/m

  # Forwards +line+ with the markup above stripped.
  def input(line)
    puts line.gsub(RUBY_WITH_BAR, '\\1')
             .gsub(RUBY, '')
             .gsub(NOTE, '')
  end
end
102
+
103
# Filter stage that discards everything up to and including the second
# separator line (a line starting with ten hyphens) and forwards the rest.
class AozoraTrimHeader
  include TextFilter

  def initialize
    # Number of separator lines seen so far (0, 1 or 2).
    @state = 0
  end

  # Forwards +line+ once two separators have been seen; before that, lines
  # are dropped and separators are counted.
  def input(line)
    if @state == 2
      puts line
    elsif [0, 1].include?(@state)
      @state += 1 if line =~ %r{^----------}
    end
  end
end
121
+
122
# Filter stage that forwards lines until the colophon begins (a line
# starting with 底本 followed by a colon) and drops that line and
# everything after it.
class AozoraTrimTrailer
  include TextFilter

  # Start of the colophon. Both the full-width colon and the ASCII colon
  # are accepted.
  TRAILER_START = /^底本[：:]/

  def initialize
    @trailer_seen = false
  end

  # Forwards +line+ unless the colophon has started.
  def input(line)
    return if @trailer_seen

    if line =~ TRAILER_START
      @trailer_seen = true
    else
      puts line
    end
  end
end
142
+
143
# Downloads archives into a local cache directory and extracts members
# from them with the external `unzip` command.
class StandardFetcher
  class << self
    DOWNLOAD_DIR = File.absolute_path('../download/', __FILE__)

    # Downloads +url+ to DOWNLOAD_DIR/+filename+.
    #
    # filename - name of the file inside DOWNLOAD_DIR.
    # url      - location to read from.
    # options  - :force => true downloads again even if the file exists.
    #
    # Returns the path of the local file. An existing file is returned
    # untouched unless :force is set. If the download fails, the partial
    # file is removed and the error is re-raised.
    def fetch(filename, url, options = {})
      output_file = File.join(DOWNLOAD_DIR, filename)
      return output_file if File.exist?(output_file) && !options[:force]

      FileUtils.mkdir_p(DOWNLOAD_DIR)

      begin
        File.open(output_file, 'wb:ASCII-8BIT') do |file|
          open_source(url) { |net| IO.copy_stream(net, file) }
        end
      rescue
        # do not leave a truncated download behind
        FileUtils.rm_f(output_file)
        raise
      end

      output_file
    end

    # Reads the member +target+ of the zip file +archive+.
    #
    # options - passed to IO.popen; by default the output of unzip is read
    #           with an external encoding of CP932.
    #
    # Returns a StringIO positioned at the start of the extracted text.
    # Raises RuntimeError if the archive does not exist, unzip cannot be
    # started, or unzip exits with a non-zero status.
    def extract(archive, target, options = { :external_encoding => 'CP932' })
      raise "archive not found: #{archive}" unless File.exist?(archive)

      # Argument-vector form: no shell is involved, so paths containing
      # spaces or shell metacharacters are passed through unchanged.
      command = ['unzip', '-xqc', archive, target, { :err => File::NULL }]
      result = StringIO.new

      begin
        IO.popen(command, 'r', options) do |io|
          io.each_line { |line| result.write line }
        end
      rescue Errno::ENOENT => e
        raise "unzip could not be started: #{e.message}"
      end
      raise "#{$?}" unless $?.success?

      result.rewind
      result
    end

    private

    # Opens +url+ for binary reading and yields the stream.
    # URI.open is used where open-uri provides it; otherwise the older
    # Kernel.open entry point is used.
    def open_source(url, &block)
      if URI.respond_to?(:open)
        URI.open(url, 'rb', &block)
      else
        Kernel.open(url, 'rb', &block)
      end
    end
  end
end
206
+
207
# Fetches an Aozora Bunko archive, runs the wanted member through the
# clean-up filter chain and stores the result as a text file in TEXT_DIR.
class AozoraFetcher < StandardFetcher
  class << self
    TEXT_DIR = File.absolute_path('../text/', __FILE__)

    # Produces the cleaned text file.
    #
    # args - :output       name of the file to write inside TEXT_DIR
    #        :archive_name local name of the downloaded archive
    #        :url          where to download the archive from
    #        :source       member of the archive to extract
    #        :force        regenerate even if the output file exists
    #
    # Returns the path of the output file.
    def run(args = {})
      @output_file = File.join(TEXT_DIR, args[:output])
      return @output_file if File.exist?(@output_file) && !args[:force]

      archive = fetch(args[:archive_name], args[:url])

      # Extract before the output file is opened: if extraction fails, no
      # empty output file is left behind to be mistaken for a cached result.
      source = extract(archive, args[:source])

      FileUtils.mkdir_p(TEXT_DIR)

      manager = create_manager
      begin
        source.each { |line| manager.input line }
      ensure
        # always close the output file, even if a filter raised
        manager.finish
      end

      @output_file
    end

    # Builds the filter chain that ends in a FileOutput writing to the
    # output file chosen by the current #run call.
    def create_manager
      manager = TextFilterManager.new
      manager << UTF8Converter.new
      manager << AozoraTrimHeader.new
      manager << AozoraTrimTrailer.new
      manager << BlankTrimmer.new
      manager << AozoraTrimmer.new
      manager << FileOutput.new(@output_file)
      manager.setup

      manager
    end
  end
end
@@ -0,0 +1,183 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'open3'
3
+ require 'optparse'
4
+
5
# Builds a Markov-chain dictionary from morphological-analyser output.
#
# Input is consumed line by line; the first whitespace-separated field of a
# line is the word, and a line reading "EOS" is ignored. For every window of
# +chain+ preceding word ids the ids of the words that followed it are
# recorded. #output_dictionary writes the result as text.
class MarkovCalculator
  # Id of the empty word that pads the window at the start of a sentence.
  BOS_ID = 0
  # Pseudo id recorded as the candidate that ends a sentence.
  EOS_ID = -1
  # Words that end a sentence (full-width and ASCII forms).
  TERMINATORS = %w[。 ？ ！ ? !].freeze
  OPEN_QUOTE = '「'
  CLOSE_QUOTE = '」'

  # options - :chain       number of preceding words per window (default: 1)
  #           :ignore_type when truthy, words are keyed by their surface
  #                        only instead of the whole input line
  #
  # Raises ArgumentError if :chain is not a positive integer.
  def initialize(options = {})
    @chain = options[:chain] || 1
    unless @chain.is_a?(Integer) && @chain >= 1
      # an empty window would make sentence termination loop forever
      raise ArgumentError, "chain must be a positive integer (got #{@chain.inspect})"
    end
    @ignore_type = options[:ignore_type]

    # word dictionary
    @dict = { '' => { :id => BOS_ID, :word => '', :next => [] } }
    @word_id = 1

    # probability (candidates)
    @tree = {}

    clear_stack
  end

  # Consumes one analyser output line. Returns nil.
  def input_line(line)
    line = line.chomp
    return if line == 'EOS'

    word, = line.split(%r{\s+})
    # blank lines carry no word; '' is reserved for the sentence-start entry
    return if word.nil? || word.empty?

    case word
    when CLOSE_QUOTE
      sentence_is_terminated
      return
    when OPEN_QUOTE
      return
    end

    id = register_word(@ignore_type ? word : line, word)

    # add to candidates, then slide the window
    add_word_to_candidate id
    @stack.shift
    @stack << id

    sentence_is_terminated if TERMINATORS.include?(word)

    nil
  end

  # Consumes every line yielded by +lines+ (anything responding to #each).
  def input(lines)
    lines.each do |line|
      input_line(line)
    end
  end

  # Writes the dictionary to +handle+: the chain length, one word per line
  # in id order, an empty separator line, then the candidate tree.
  def output_dictionary(handle)
    handle.puts @chain.to_s
    output_words handle
    handle.puts
    output_tree handle
  end

  # Writes the surface of every word, ordered by id, one per line.
  def output_words(handle)
    @dict.values.sort_by { |item| item[:id] }.each do |item|
      handle.puts item[:word]
    end
  end

  # Writes the candidate tree, indenting each level by one space.
  def output_tree(handle)
    output_tree_node(handle, @tree, 0)
  end

  private

  # Returns the id stored under +key+, assigning the next free id (and
  # remembering +word+ as its surface) when the key is new.
  def register_word(key, word)
    entry = @dict[key]
    return entry[:id] if entry

    id = @word_id
    @word_id += 1
    @dict[key] = { :id => id, :word => word }
    id
  end

  # Writes +node+ with keys in ascending order. A node holding :cands is
  # written as "key=c1,c2,..."; any other node is written as "key" followed
  # by its children one level deeper.
  def output_tree_node(handle, node, depth)
    node.keys.sort.each do |key|
      child = node[key]

      handle.write "#{' ' * depth}#{key}"

      if child.has_key?(:cands)
        cands = child[:cands].sort
        # a single distinct candidate is written once
        cands = [cands.first] if cands.uniq.size == 1

        handle.write '='
        handle.puts cands.join(',')
      else
        handle.write "\n"

        output_tree_node(handle, child, depth + 1)
      end
    end
  end

  # Closes the current sentence: records EOS as the follower of every
  # remaining window, then resets the window.
  def sentence_is_terminated
    # Nothing has been read since the last reset (e.g. a closing quote right
    # after a terminator): there is no sentence to close, and recording EOS
    # here would mark the empty sentence as a valid one.
    return if @stack.all? { |id| id == BOS_ID }

    until @stack.first == EOS_ID
      add_word_to_candidate EOS_ID

      @stack.shift
      @stack << EOS_ID
    end

    clear_stack
  end

  # Records +word_id+ as a follower of the current window.
  def add_word_to_candidate(word_id)
    node = @stack.inject(@tree) { |current, wid| current[wid] ||= {} }

    node[:cands] ||= []
    node[:cands] << word_id
  end

  # Resets the window to +chain+ sentence-start ids.
  def clear_stack
    @stack = [BOS_ID] * @chain
  end

  # Command-line front end: pipes the input files (or standard input)
  # through `mecab -O simple` and prints the resulting dictionary.
  class CLI
    # Whitespace (including full-width spaces) at either end of a line.
    EDGE_SPACES = /(?:^[[:space:]]+|[[:space:]]+$)/

    def self.main
      chain = 1
      ignore_type = false

      opt = OptionParser.new

      opt.on('-c CHAIN', 'chain of precedences (default: 1)') { |v| chain = v.to_i }
      opt.on('-n', 'ignore a part of speech') { |v| ignore_type = v }

      opt.parse! ARGV

      calculator = MarkovCalculator.new(:chain => chain,
                                        :ignore_type => ignore_type)

      # popen2 leaves mecab's stderr attached to ours; with popen3 the
      # unread stderr pipe could fill up and stall mecab.
      Open3.popen2('mecab -O simple') do |stdin, stdout, _wait_thr|
        feeder = Thread.new do
          begin
            ARGF.set_encoding 'utf-8:utf-8'
            ARGF.each do |line|
              stdin.puts line.gsub(EDGE_SPACES, '')
            end
          ensure
            # always close, otherwise mecab keeps waiting for more input
            stdin.close
          end
        end

        calculator.input(stdout)
        # surfaces any error raised while feeding mecab
        feeder.join
      end

      calculator.output_dictionary(STDOUT)
    end
  end
end
180
+
181
# Run the command-line front end only when this file is executed directly,
# not when it is loaded by another file.
MarkovCalculator::CLI.main if __FILE__ == $PROGRAM_NAME