RubyGems - cbeta - Versions diffs - 0.1.0 → 0.2.0 - Mend

cbeta 0.1.0 → 0.2.0

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 4534f862bd8b4825c28655db9d68d47d0f42bfc0
-  data.tar.gz: 9dec47ef7f7989cc7c4e1c8f821fd74d38e78c79
+  metadata.gz: 8fa52e9b0b8aedcc963fb3fe04e7671e11195136
+  data.tar.gz: ce0abdc6a26880da654608dd5e23bdb227e28e54
 SHA512:
-  metadata.gz: dc320239449683133eb32a0a7c357fca63ddb39243ce800d5c8063396efcd59f7ff81cb51b163af2eea6dc7771ba2ac4c640ebdf2daf5e7b67716ebab2b478e5
-  data.tar.gz: 583873fba9159f24d3cd006122c15e752a072aa58df3dd2f8c8d1e15e26a4a3b1ba4454f56cb8d91ebcde685ee7a37bbc05a768a96e1e92f426dcc193c7877ad
+  metadata.gz: e201a6601286381216794fd9cf704785a01339782813395f51331037028b57d498a3c58852e1ce5ffcd2597ae5eec356be86f10c55c3e08619cd230e78019bb4
+  data.tar.gz: b3487594d36f4e698f2bd61ce75a69c7d057975fa20e68683293782e580f43ede41afc4619822a49447aad5bac8f1a6d19d22f7e7e0c9d68b3f7f6f62d1b89e9

data/lib/cbeta.rb CHANGED Viewed

@@ -51,4 +51,5 @@ end
 require 'cbeta/gaiji'
 require 'cbeta/bm_to_text'
 require 'cbeta/p5a_to_html'
+require 'cbeta/p5a_to_text'
 require 'cbeta/html_to_text'

data/lib/cbeta/p5a_to_text.rb ADDED Viewed

@@ -0,0 +1,434 @@
+require 'cgi'
+require 'date'
+require 'fileutils'
+require 'json'
+require 'nokogiri'
+require 'set'
+# Convert CBETA XML P5a to Text
+#
+# CBETA XML P5a 可由此取得: https://github.com/cbeta-git/xml-p5a
+class CBETA::P5aToText
+  # @param xml_root [String] 來源 CBETA XML P5a 路徑
+  # @param output_root [String] 輸出 Text 路徑
+  def initialize(xml_root, output_root)
+    @xml_root = xml_root
+    @output_root = output_root
+    @cbeta = CBETA.new
+    @gaijis = CBETA::Gaiji.new
+    # 載入 unicode 1.1 字集列表
+    fn = File.join(File.dirname(__FILE__), 'unicode-1.1.json')
+    json = File.read(fn)
+    @unicode1 = JSON.parse(json)
+  end
+  # 將 CBETA XML P5a 轉為 Text
+  #
+  # @example for convert 大正藏第一冊:
+  #
+  #   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
+  #   x2h.convert('T01')
+  #
+  # @example for convert 大正藏全部:
+  #
+  #   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
+  #   x2h.convert('T')
+  #
+  # @example for convert 大正藏第五冊至第七冊:
+  #
+  #   x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
+  #   x2h.convert('T05..T07')
+  #
+  # T 是大正藏的 ID, CBETA 的藏經 ID 系統請參考: http://www.cbeta.org/format/id.php
+  def convert(target=nil)
+    return convert_all if target.nil?
+    arg = target.upcase
+    if arg.size == 1
+      handle_collection(arg)
+    else
+      if arg.include? '..'
+        arg.match(/^([^\.]+?)\.\.([^\.]+)$/) {
+          handle_vols($1, $2)
+        }
+      else
+        handle_vol(arg)
+      end
+    end
+  end
+  private
+  def convert_all
+    Dir.foreach(@xml_root) { |c|
+      next unless c.match(/^[A-Z]$/)
+      handle_collection(c)
+    }
+  end
+  def handle_anchor(e)
+    if e.has_attribute?('type')
+      if e['type'] == 'circle'
+        return '◎'
+      end
+    end
+    ''
+  end
+  # <app>: outputs the converted children unchanged.
+  def handle_app(e)
+    traverse(e)
+  end
+  def handle_byline(e)
+    traverse(e) + "\n"
+  end
+  def handle_cell(e)
+    traverse(e) + "\n"
+  end
+  def handle_collection(c)
+    @series = c
+    puts 'handle_collection ' + c
+    folder = File.join(@xml_root, @series)
+    Dir.foreach(folder) { |vol|
+      next if ['.', '..', '.DS_Store'].include? vol
+      handle_vol(vol)
+    }
+  end
+  def handle_corr(e)
+    "<r w='【CBETA】'>%s</r>" % traverse(e)
+  end
+  # <div>: outputs the converted children unchanged.
+  def handle_div(e)
+    traverse(e)
+  end
+  def handle_figure(e)
+    traverse(e) + "\n"
+  end
+  def handle_g(e)
+    # if 有 <mapping type="unicode">
+    #   直接採用
+    # else if 有 <mapping type="normal_unicode">
+    #   採用 normal_unicode
+    # else if 有 normalized form
+    #   採用 normalized form
+    # else
+    #   Unicode PUA
+    gid = e['ref'][1..-1]
+    g = @gaijis[gid]
+    abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?
+    zzs = g['zzs']
+    if gid.start_with?('SD')
+      case gid
+      when 'SD-E35A'
+        return '（'
+      when 'SD-E35B'
+        return '）'
+      else
+        return g['roman']
+      end
+    end
+    return g['roman'] if gid.start_with?('RJ')
+    return g['unicode-char'] if g.has_key?('unicode')
+    return g['normal_unicode'] if g.has_key?('normal_unicode')
+    return g['normal'] if g.has_key?('normal')
+    # Unicode PUA
+    [0xf0000 + gid[2..-1].to_i].pack 'U'
+  end
+  # <graphic>: produces no text output.
+  def handle_graphic(e)
+    ''
+  end
+  def handle_head(e)
+    traverse(e) + "\n"
+  end
+  def handle_item(e)
+    traverse(e) + "\n"
+  end
+  def handle_juan(e)
+    traverse(e) + "\n"
+  end
+  def handle_l(e)
+    r = traverse(e)
+    unless @lg_type == 'abnormal'
+      r += "\n"
+    end
+    r
+  end
+  def handle_lb(e)
+    r = ''
+    unless @next_line_buf.empty?
+      r += @next_line_buf + "\n"
+      @next_line_buf = ''
+    end
+    r
+  end
+  def handle_lem(e)
+    r = ''
+    r = traverse(e)
+    w = e['wit'].scan(/【.*?】/)
+    @editions.merge w
+    w = w.join(' ')
+    "<r w='#{w}'>#{r}</r>"
+  end
+  # <lg>: outputs the converted children unchanged.
+  def handle_lg(e)
+    traverse(e)
+  end
+  def handle_list(e)
+    "\n" + traverse(e)
+  end
+  def handle_milestone(e)
+    r = ''
+    if e['unit'] == 'juan'
+      @juan = e['n'].to_i
+      r += "<juan #{@juan}>"
+    end
+    r
+  end
+  # <mulu>: produces no text output.
+  def handle_mulu(e)
+    ''
+  end
+  def handle_node(e)
+    return '' if e.comment?
+    return handle_text(e) if e.text?
+    return '' if PASS.include?(e.name)
+    r = case e.name
+    when 'anchor'    then handle_anchor(e)
+    when 'app'       then handle_app(e)
+    when 'back'      then ''
+    when 'byline'    then handle_byline(e)
+    when 'cell'      then handle_cell(e)
+    when 'corr'      then handle_corr(e)
+    when 'div'       then handle_div(e)
+    when 'figure'    then handle_figure(e)
+    when 'foreign'   then ''
+    when 'g'         then handle_g(e)
+    when 'graphic'   then handle_graphic(e)
+    when 'head'      then handle_head(e)
+    when 'item'      then handle_item(e)
+    when 'juan'      then handle_juan(e)
+    when 'l'         then handle_l(e)
+    when 'lb'        then handle_lb(e)
+    when 'lem'       then handle_lem(e)
+    when 'lg'        then handle_lg(e)
+    when 'list'      then handle_list(e)
+    when 'mulu'      then handle_mulu(e)
+    when 'note'      then handle_note(e)
+    when 'milestone' then handle_milestone(e)
+    when 'p'         then handle_p(e)
+    when 'rdg'       then handle_rdg(e)
+    when 'reg'       then ''
+    when 'row'       then handle_row(e)
+    when 'sic'       then handle_sic(e)
+    when 'sg'        then handle_sg(e)
+    when 't'         then handle_t(e)
+    when 'table'     then handle_table(e)
+    when 'teiHeader' then ''
+    else traverse(e)
+    end
+    r
+  end
+  def handle_note(e)
+    if e.has_attribute?('place') && e['place']=='inline'
+      r = traverse(e)
+      return "（#{r}）"
+    end
+    ''
+  end
+  def handle_p(e)
+    traverse(e) + "\n"
+  end
+  def handle_rdg(e)
+    r = traverse(e)
+    w = e['wit'].scan(/【.*?】/)
+    @editions.merge w
+    "<r w='#{e['wit']}'>#{r}</r>"
+  end
+  # <row>: outputs the converted children unchanged.
+  def handle_row(e)
+    traverse(e)
+  end
+  def handle_sg(e)
+    '(' + traverse(e) + ')'
+  end
+  def handle_sic(e)
+    "<r w='#{@orig}'>" + traverse(e) + "</r>"
+  end
+  def handle_sutra(xml_fn)
+    puts "convert sutra #{xml_fn}"
+    @dila_note = 0
+    @div_count = 0
+    @editions = Set.new ["【CBETA】"]
+    @in_l = false
+    @juan = 0
+    @lg_row_open = false
+    @mod_notes = Set.new
+    @next_line_buf = ''
+    @open_divs = []
+    @sutra_no = File.basename(xml_fn, ".xml")
+    text = parse_xml(xml_fn)
+    # 大正藏 No. 220 大般若經跨冊，CBETA 分成多檔並在檔尾加上 a, b, c....
+    # 輸出時去掉這些檔尾的 a, b, b....
+    if @sutra_no.match(/^(T05|T06|T07)n0220/)
+      @sutra_no = "#{$1}n0220"
+    end
+    @out_sutra = File.join(@out_vol, @sutra_no)
+    FileUtils.makedirs @out_sutra
+    juans = text.split(/(<juan \d+>)/)
+    open = false
+    fo = nil
+    juan_no = nil
+    fn = ''
+    buf = ''
+    # 一卷一檔
+    juans.each { |j|
+      if j =~ /<juan (\d+)>$/
+        juan_no = $1.to_i
+      else
+        if juan_no.nil?
+          buf = j
+        else
+          write_juan(juan_no, buf+j)
+          buf = ''
+        end
+      end
+    }
+  end
+  def handle_t(e)
+    if e.has_attribute? 'place'
+      return '' if e['place'].include? 'foot'
+    end
+    r = traverse(e)
+    # 處理雙行對照
+    i = e.xpath('../t').index(e)
+    case i
+    when 0
+      return r + '　'
+    when 1
+      @next_line_buf += r + '　'
+      return ''
+    else
+      return r
+    end
+  end
+  # <table>: outputs the converted children unchanged.
+  def handle_table(e)
+    traverse(e)
+  end
+  def handle_text(e)
+    s = e.content().chomp
+    return '' if s.empty?
+    return '' if e.parent.name == 'app'
+    # cbeta xml 文字之間會有多餘的換行
+    r = s.gsub(/[\n\r]/, '')
+    # 把 & 轉為 &amp;
+    CGI.escapeHTML(r)
+  end
+  def handle_vol(vol)
+    puts "convert volumn: #{vol}"
+    @orig = @cbeta.get_canon_abbr(vol[0])
+    abort "未處理底本" if @orig.nil?
+    @vol = vol
+    @series = vol[0]
+    @out_vol = File.join(@output_root, @series, vol)
+    FileUtils.remove_dir(@out_vol, force=true)
+    FileUtils.makedirs @out_vol
+    source = File.join(@xml_root, @series, vol)
+    Dir[source+"/*"].each { |f|
+      handle_sutra(f)
+    }
+  end
+  def handle_vols(v1, v2)
+    puts "convert volumns: #{v1}..#{v2}"
+    @series = v1[0]
+    folder = File.join(@xml_root, @series)
+    Dir.foreach(folder) { |vol|
+      next if vol < v1
+      next if vol > v2
+      handle_vol(vol)
+    }
+  end
+  def open_xml(fn)
+    s = File.read(fn)
+    doc = Nokogiri::XML(s)
+    doc.remove_namespaces!()
+    doc
+  end
+  def parse_xml(xml_fn)
+    doc = open_xml(xml_fn)
+    root = doc.root()
+    body = root.xpath("text/body")[0]
+    traverse(body)
+  end
+  def traverse(e)
+    r = ''
+    e.children.each { |c|
+      s = handle_node(c)
+      r += s
+    }
+    r
+  end
+  def write_juan(juan_no, txt)
+    @editions.each do |ed|
+      frag = Nokogiri::XML.fragment(txt)
+      frag.search("r").each do |node|
+        if node['w'] != ed
+          node.remove
+        end
+      end
+      folder = File.join(@out_sutra, ed)
+      FileUtils.makedirs(folder)
+      fn = "#{@sutra_no}_%03d.txt" % juan_no
+      output_path = File.join(folder, fn)
+      File.write(output_path, frag.content)
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cbeta
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ray Chou
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-11 00:00:00.000000000 Z
+date: 2015-06-23 00:00:00.000000000 Z
 dependencies: []
 description: Ruby gem for use Chinese Buddhist Text resources made by CBETA (http://www.cbeta.org).
 email: zhoubx@gmail.com
@@ -23,6 +23,7 @@ files:
 - lib/cbeta/gaiji.rb
 - lib/cbeta/html_to_text.rb
 - lib/cbeta/p5a_to_html.rb
+- lib/cbeta/p5a_to_text.rb
 - lib/cbeta/unicode-1.1.json
 homepage: https://github.com/RayCHOU/ruby-cbeta
 licenses: