RubyGems - lorem_jp - Versions diffs - 0.0.1 - Mend

lorem_jp 0.0.1

Files changed (17) hide show

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 3b5ed5259d27443978984fcdee63ad7f09d004d2
+  data.tar.gz: 4710492f6fc10793b6722337531a70eca870d232
+SHA512:
+  metadata.gz: 6107f68c64a7c67f7cc33b6abb272c26fa7e3330025a1c3dae5c65dfb36388bd7d3debeef4255a380e611f3bb367d9988a700fa9423aeafbc50dc0dd78e74f27
+  data.tar.gz: 8ed7f6edf858bf1fe1dd4d5dfd2402bcfb2a78325eee9122407b2d50d79e3d095813bdc7d7a2e156dd0c20ae6a8f83c40750b6c64595e617b8e86aa0c3cac560

data/Gemfile ADDED

@@ -0,0 +1,3 @@
+source 'https://rubygems.org'
+gemspec

data/LICENSE.txt ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2013 ITO Nobuaki
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED

@@ -0,0 +1,57 @@
+# LoremJP
+Japanese Lorem Ipsum generator.
+## Usage
+```ruby
+# Singleton API
+puts LoremJP.sentence   # => outputs a meaningless Japanese sentence
+# Or create an instance and re-use it
+generator = LoremJP.new
+generator.sentence      # => ...
+generator.sentence      # => ...
+```
+A command-line tool, `lorem_jp`, is also available.
+    $ lorem_jp
+    blah blah blah ...
+## Installation
+Add this line to your application's Gemfile:
+    gem 'lorem_jp', :github => 'dayflower/lorem_jp'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install lorem_jp
+## Notice
+The default dictionary is assembled from out-of-copyright texts provided by [Aozora Bunko](http://www.aozora.gr.jp/index.html).
+* "[Chawan no yu](http://www.aozora.gr.jp/cards/000042/card2363.html)"
+  by "[Torahiko Terada](http://www.aozora.gr.jp/index_pages/person42.html)"
+* "[Akai fune no okyaku](http://www.aozora.gr.jp/cards/001475/card52960.html)"
+  by "[Mimei Ogawa](http://www.aozora.gr.jp/index_pages/person1475.html)"
+## TODO
+* write usage documentation (in README)
+* write documentation for building a custom dictionary
+* write more tests
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request

data/Rakefile ADDED

@@ -0,0 +1,7 @@
+require 'bundler/gem_tasks'
+require 'rake/testtask'
+Rake::TestTask.new do |t|
+  t.pattern = 'test/**/*_spec.rb'
+  t.libs.push 'test'
+end

data/bin/lorem_jp ADDED

@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+require 'lorem_jp'
+require 'lorem_jp/cli'
+LoremJP::CLI.main

data/build/fetcher.rb ADDED

@@ -0,0 +1,248 @@
+# -*- coding: utf-8 -*-
+require 'fileutils'
+require 'stringio'
+require 'open-uri'
# Mixin for one stage of a line-oriented filter chain.
#
# An including class overrides #input to receive a line, and may call the
# private #puts helper to hand a (possibly transformed) line to the stage
# stored in #next_filter.
module TextFilter
  # The downstream stage, or nil when this stage is the last one.
  attr_accessor :next_filter

  # Receives one line. Including classes must override this method.
  #
  # @param line [String] the line handed over by the previous stage
  # @raise [RuntimeError] always, in this default implementation
  def input(line)
    raise 'must be overridden'
  end

  # Signals end of input by propagating #finish to the next stage, if any.
  def finish
    @next_filter.finish if @next_filter
  end

  private

  # Hands +line+ to the next stage. Inside a filter this shadows
  # Kernel#puts, so a bare `puts line` means "emit downstream".
  #
  # A stage without a successor simply drops the line, mirroring the way
  # #finish tolerates a missing next filter.
  def puts(line)
    @next_filter.input(line) if @next_filter
  end
end
# Terminal filter stage that keeps every line it receives. Because the
# class is itself an Array, the collected lines are its own elements.
class ResultCatcher < Array
  include TextFilter

  # Stores +line+ at the end of this array; nothing is forwarded downstream.
  def input(line)
    push(line)
  end
end
# Terminal filter stage that writes every received line to a file.
class FileOutput
  include TextFilter

  # Opens +filename+ for writing with UTF-8 as the external encoding,
  # truncating any existing file.
  #
  # File.open is used rather than Kernel#open so that a name starting
  # with "|" is never executed as a command.
  #
  # @param filename [String] path of the file to create
  def initialize(filename)
    @handle = File.open(filename, 'w:utf-8')
  end

  # Writes +line+ to the file exactly as given (no newline is appended).
  def input(line)
    @handle.write line
  end

  # Closes the file, then lets the mixin propagate #finish downstream.
  def finish
    @handle.close
    super
  end
end
# An ordered list of filter stages. After the stages have been appended,
# #setup wires them into a chain; #input and #finish then drive the chain
# through its first stage.
class TextFilterManager < Array
  # Points each stage's next_filter at the stage that follows it in this
  # array; the last stage gets nil.
  def setup
    downstream = nil
    reverse_each do |stage|
      stage.next_filter = downstream
      downstream = stage
    end
  end

  # Feeds +line+ to the first stage.
  def input(line)
    first.input(line)
  end

  # Tells the first stage that the input is exhausted.
  def finish
    first.finish
  end
end
# Filter stage that transcodes each line to UTF-8 before passing it on.
class UTF8Converter
  include TextFilter

  # Re-encodes +line+ from its current encoding to UTF-8 and emits it.
  def input(line)
    utf8_line = line.encode('UTF-8')
    puts utf8_line
  end
end
# Filter stage that removes leading ideographic (full-width) spaces and
# drops lines that are blank afterwards.
class BlankTrimmer
  include TextFilter

  # Emits +line+ without its leading full-width spaces, unless only
  # whitespace remains.
  #
  # The caller's string is left untouched: a copy is trimmed, so frozen
  # input is accepted as well.
  def input(line)
    trimmed = line.sub(%r{^　+}xmo, '')
    puts trimmed unless trimmed =~ %r{^\s*$}xmo
  end
end
# Filter stage that strips inline markup from a line. The patterns look
# like Aozora Bunko annotation syntax (presumably ruby readings and editor
# notes -- confirm against the Aozora format description).
class AozoraTrimmer
  include TextFilter

  # Emits +line+ after three substitutions, applied in this order:
  # 1. "｜base《...》" is reduced to "base";
  # 2. any remaining "《...》" span is removed;
  # 3. any "［＃...］" span is removed.
  def input(line)
    base_kept     = line.gsub(%r{｜(\S+?)《.*?》}xmo, '\\1')
    brackets_gone = base_kept.gsub(%r{《.*?》}xmo, '')
    notes_gone    = brackets_gone.gsub(%r{［＃.*?］}xmo, '')
    puts notes_gone
  end
end
# Filter stage that discards the head of a text: nothing is emitted until
# two lines starting with ten hyphens have been seen. Those two separator
# lines are discarded as well; every later line passes through.
class AozoraTrimHeader
  include TextFilter

  def initialize
    # Number of separator lines seen so far; counting stops at 2.
    @state = 0
  end

  # Emits +line+ once the header is over; before that, only counts
  # separator lines.
  def input(line)
    if @state == 2
      puts line
    elsif line =~ %r{^----------}
      @state += 1
    end
  end
end
# Filter stage that discards the tail of a text: lines pass through until
# one starting with "底本：" arrives; that line and everything after it
# are dropped.
class AozoraTrimTrailer
  include TextFilter

  def initialize
    # 0 while still in the body, 1 once the trailer marker has been seen.
    @state = 0
  end

  # Emits +line+ unless the trailer has started (or starts with it).
  def input(line)
    return if @state == 1

    if line =~ %r{^底本：}
      @state = 1
    else
      puts line
    end
  end
end
# Downloads archives into a local cache directory and reads single
# members out of them with the external `unzip` command.
class StandardFetcher
  class << self
    # Cache directory: "download" next to this source file.
    DOWNLOAD_DIR = File.absolute_path('../download/', __FILE__)

    # Downloads +url+ to DOWNLOAD_DIR/+filename+ unless it is already
    # cached.
    #
    # @param filename [String] name of the cached file
    # @param url [String] location to download from (opened via open-uri)
    # @param options [Hash] :force => true re-downloads even when cached
    # @return [String] path of the cached file
    # @raise [StandardError] whatever the download raised; a partially
    #   written file is removed first
    def fetch(filename, url, options = {})
      output_file = File.join(DOWNLOAD_DIR, filename)
      return output_file if File.exist?(output_file) && !options[:force]

      # makedirs is a no-op when the directory already exists.
      FileUtils.makedirs(DOWNLOAD_DIR)

      begin
        File.open(output_file, 'wb:ASCII-8BIT') do |file|
          # URI.open is required on Ruby 3.0+, where Kernel#open no longer
          # handles URLs even with open-uri loaded.
          URI.open(url, 'rb') do |net|
            IO.copy_stream(net, file)
          end
        end
      rescue
        # Never leave a truncated download behind: it would be mistaken
        # for a valid cache entry on the next call.
        FileUtils.rm_f(output_file)
        raise
      end

      output_file
    end

    # Reads the member +target+ of the zip file +archive+ by running
    # `unzip -xqc` with unzip's stderr discarded.
    #
    # The command is passed as an argument vector, not through a shell, so
    # spaces or shell metacharacters in the names are harmless. As a
    # consequence a missing `unzip` binary surfaces as Errno::ENOENT.
    #
    # @param archive [String] path of the zip file
    # @param target [String] member name inside the archive
    # @param options [Hash] IO.popen options; by default the output is
    #   read as CP932
    # @return [StringIO] the member's content, rewound to the start
    # @raise [RuntimeError] if +archive+ does not exist or unzip fails
    def extract(archive, target, options = { :external_encoding => 'CP932' })
      raise "archive not found: #{archive}" unless File.exist?(archive)

      result  = StringIO.new
      command = ['unzip', '-xqc', archive, target, { :err => File::NULL }]
      IO.popen(command, 'r', **options) do |io|
        io.each_line { |line| result.write line }
      end
      # $? is nil-safe here: popen's block form always sets it on return.
      raise "#{$?}" unless $?.success?

      result.rewind
      result
    end
  end
end
# Fetches a zipped text, runs it through the filter chain built by
# .create_manager and stores the result under the "text" directory.
class AozoraFetcher < StandardFetcher
  class << self
    # Output directory: "text" next to this source file.
    TEXT_DIR = File.absolute_path('../text/', __FILE__)

    # Produces the filtered text file, reusing an existing one unless
    # forced.
    #
    # @param args [Hash] with the keys
    #   :output       - name of the file to create in TEXT_DIR
    #   :archive_name - name under which the download is cached
    #   :url          - location of the zip archive
    #   :source       - member of the archive to read
    #   :force        - regenerate even if the output already exists
    # @return [String] path of the output file
    def run(args = {})
      @output_file = File.join(TEXT_DIR, args[:output])
      return @output_file if File.exist?(@output_file) && !args[:force]

      archive = fetch(args[:archive_name], args[:url])
      # Extract before the output file is opened, so a failing download or
      # unzip cannot leave an empty file that later runs treat as a cache.
      source = extract(archive, args[:source])

      FileUtils.makedirs(TEXT_DIR)
      manager = create_manager

      completed = false
      begin
        source.each { |line| manager.input line }
        completed = true
      ensure
        # Always close the output; a partial file is removed for the same
        # cache reason as above.
        manager.finish
        FileUtils.rm_f(@output_file) unless completed
      end

      @output_file
    end

    # Builds the filter chain, ending in a FileOutput that writes to the
    # path stored in @output_file by .run.
    #
    # @return [TextFilterManager] the wired chain
    def create_manager
      manager = TextFilterManager.new
      manager << UTF8Converter.new
      manager << AozoraTrimHeader.new
      manager << AozoraTrimTrailer.new
      manager << BlankTrimmer.new
      manager << AozoraTrimmer.new
      manager << FileOutput.new(@output_file)
      manager.setup
      manager
    end
  end
end

data/build/make_dict.rb ADDED

@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+require 'open3'
+require 'optparse'
# Builds a Markov-chain dictionary from tokenizer output.
#
# Each input line is expected to be "word<whitespace>type" (the CLI feeds
# it the output of `mecab -O simple`). Every distinct token gets a numeric
# id, and for every window of +chain+ preceding ids the ids that followed
# it are recorded as candidates.
class MarkovCalculator
  # Id standing for "before the first word of a sentence".
  BOS_ID = 0
  # Id standing for "end of sentence".
  EOS_ID = -1
  # Words that end a sentence (they are still recorded as tokens).
  SENTENCE_ENDINGS = %w[ 。 ？ ！ ].freeze

  # @param options [Hash]
  #   :chain       - number of preceding words forming a state (default 1)
  #   :ignore_type - when truthy, tokens are keyed by word only, so the
  #                  same word with different types shares one id
  # @raise [ArgumentError] if :chain is not an Integer >= 1; a shorter
  #   window would make the sliding stack grow and produce a broken tree
  def initialize(options = {})
    @chain = options[:chain] || 1
    unless @chain.is_a?(Integer) && @chain >= 1
      raise ArgumentError, "chain must be a positive integer: #{@chain.inspect}"
    end
    @ignore_type = options[:ignore_type]

    # word dictionary; the empty key is the begin-of-sentence entry
    @dict = { '' => { :id => BOS_ID, :word => '', :next => [] } }
    @word_id = 1

    # probability (candidates)
    @tree = {}

    clear_stack
  end

  # Consumes one tokenizer line.
  #
  # "EOS" lines and lines without a word are ignored. "「" is skipped and
  # "」" closes the current sentence; neither is recorded as a token.
  #
  # @param line [String] a single line, with or without trailing newline
  # @return [nil]
  def input_line(line)
    line = line.chomp
    return if line == 'EOS'

    word, _type = line.split(%r{\s+})
    # A blank line would otherwise resolve to the begin-of-sentence entry
    # and register BOS_ID as a candidate.
    return if word.nil? || word.empty?

    if word == '」'
      sentence_is_terminated
      return
    end
    return if word == '「'

    key = @ignore_type ? word : line

    # register word to dictionary
    unless @dict.has_key?(key)
      @dict[key] = { :id => @word_id, :word => word }
      @word_id += 1
    end
    id = @dict[key][:id]

    # add to candidates, then slide the window
    add_word_to_candidate(id)
    @stack.shift
    @stack << id

    sentence_is_terminated if SENTENCE_ENDINGS.include?(word)

    nil
  end

  # Consumes every line of +lines+ (anything responding to #each).
  def input(lines)
    lines.each do |line|
      input_line(line.chomp)
    end
  end

  # Writes the dictionary to +handle+: the chain length, one word per line
  # in id order, an empty separator line, then the candidate tree.
  def output_dictionary(handle)
    # chains
    handle.puts @chain.to_s

    # word dictionary
    output_words handle

    # separator
    handle.puts

    # probabilities
    output_tree handle
  end

  # Writes all words, one per line, ordered by id.
  def output_words(handle)
    @dict.values.sort_by { |item| item[:id] }.each do |item|
      handle.puts item[:word]
    end
  end

  # Writes the candidate tree, indented one space per level.
  def output_tree(handle)
    output_tree_node(handle, @tree, 0)
  end

  private

  # Writes +node+ recursively. Inner nodes print their key on a line of
  # its own; leaves print "key=cand,cand,...".
  def output_tree_node(handle, node, depth)
    node.keys.sort.each do |key|
      child = node[key]

      handle.write ' ' * depth
      handle.write key

      if child.has_key?(:cands)
        handle.write "="

        cands = child[:cands].sort
        # A list holding one value repeated is collapsed to that value.
        cands = [cands.first] if cands.uniq.size == 1

        handle.puts cands.join(",")
      else
        handle.write "\n"
        output_tree_node(handle, child, depth + 1)
      end
    end
  end

  # Records end-of-sentence after the current window and after each
  # shorter tail of it, then resets the window for the next sentence.
  def sentence_is_terminated
    while @stack[0] != EOS_ID
      add_word_to_candidate(EOS_ID)
      @stack.shift
      @stack << EOS_ID
    end

    clear_stack
  end

  # Appends +word_id+ to the candidates of the state given by the current
  # window, creating the tree path on the way.
  def add_word_to_candidate(word_id)
    node = @stack.inject(@tree) { |parent, wid| parent[wid] ||= {} }
    (node[:cands] ||= []) << word_id
  end

  # Resets the window to +chain+ begin-of-sentence ids.
  def clear_stack
    @stack = [ BOS_ID ] * @chain
  end

  # Command-line front end: pipes the input files (or stdin) through
  # `mecab -O simple` and prints the resulting dictionary to stdout.
  class CLI
    # Parses ARGV (-c CHAIN, -n) and runs the conversion.
    def self.main
      chain       = 1
      ignore_type = false

      opt = OptionParser.new
      # Integer coercion rejects a non-numeric CHAIN instead of silently
      # turning it into 0.
      opt.on('-c CHAIN', Integer, 'chain of precedences (default: 1)') { |v| chain = v }
      opt.on('-n',       'ignore a part of speech') { |v| ignore_type = v }
      opt.parse! ARGV

      calculator = MarkovCalculator.new(:chain => chain,
                                        :ignore_type => ignore_type)

      Open3.popen3('mecab -O simple') { |stdin, stdout, stderr, wait_thr|
        # Feed mecab from a separate thread so that reading its output
        # below cannot block on a full pipe.
        feeder = Thread.new {
          begin
            ARGF.set_encoding 'utf-8:utf-8'
            ARGF.each do |line|
              stdin.puts line.gsub(%r{(?: ^ [\s　]+ | [\s　]+ $ )}xmo, '')
            end
          ensure
            stdin.close
          end
        }

        calculator.input(stdout)
        feeder.join
      }

      calculator.output_dictionary(STDOUT)
    end
  end
end
+if __FILE__ == $0
+  MarkovCalculator::CLI.main
+end