cbeta 1.2.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 40faf97b4f5c3b0070145cefbdf86fe7eae39b72
4
- data.tar.gz: ebe45fbfc2bf8eb3e025754234ec7e23f6107c49
3
+ metadata.gz: a21aa8cd67f37fc6d5ca9d87720b1d5d0da3f7f1
4
+ data.tar.gz: bb7fb6cb3ab5d8b36492cc96728aa5a7fc9a7286
5
5
  SHA512:
6
- metadata.gz: 2d8bf5130fa58d65fe550c3855ba25b46714871e7e1abfe14d6b8546d3a2ca26e70ef1b23871cfc628b81f357fc83a84177a93a160ead4f6d75dd041c11ccd17
7
- data.tar.gz: cc189891bd08585916c2fe6fb0c645d280558a1f214443a0116b5d9595707a6586323c37d5d886a25bbb3b83357e51fba71ccb8505ce053e543c1d7659225135
6
+ metadata.gz: cb8bbfd96ad22c331e01059303ca539cb6718aae0caa551eba4a8064a7854d9f29bbd3bdb447bd06dd6a6f14b14363d16edb7b4f0932bab7fcfb0feccb9ba9bf
7
+ data.tar.gz: 2a4394f918a235d08790409488e5dc1a1fd280239d9e1df2ece6dc99628ffc67b73a5beeef199d35a5fe10f4c781cf7ee19910585ff3aa13719b6163b0725d46
@@ -7,6 +7,7 @@ require 'csv'
7
7
 
8
8
  class CBETA
9
9
  DATA = File.join(File.dirname(__FILE__), 'data')
10
+ PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
10
11
 
11
12
  # 將行首資訊轉為引用格式
12
13
  #
@@ -23,6 +24,13 @@ class CBETA
23
24
  nil
24
25
  end
25
26
 
27
+ def self.open_xml(fn)
28
+ s = File.read(fn)
29
+ doc = Nokogiri::XML(s)
30
+ doc.remove_namespaces!()
31
+ doc
32
+ end
33
+
26
34
  # 傳入 蘭札體 缺字碼,傳回 Unicode PUA 字元
27
35
  def self.ranjana_pua(gid)
28
36
  i = 0x10000 + gid[-4..-1].hex
@@ -100,10 +108,13 @@ class CBETA
100
108
  def get_category(book_id)
101
109
  @categories[book_id]
102
110
  end
111
+
103
112
  end
104
113
 
105
114
  require 'cbeta/gaiji'
106
115
  require 'cbeta/bm_to_text'
116
+ require 'cbeta/char_count'
117
+ require 'cbeta/char_freq'
107
118
  require 'cbeta/html_to_pdf'
108
119
  require 'cbeta/p5a_to_epub'
109
120
  require 'cbeta/p5a_to_html'
@@ -0,0 +1,99 @@
1
# Counts the characters in the body text of the XML files found under
# <xml_root>/<canon>/<volume>/<file>.xml and accumulates one total per work.
class CBETA::CharCount
  # @param xml_root [String] folder whose sub-folders are the canons
  def initialize(xml_root)
    @result = {}
    @xml_root = xml_root
  end

  # Run the count for one canon, or for every canon when none is given.
  #
  # @param canon [String, nil] canon folder name; nil means all canons
  # @return [Hash{String=>Integer}] work id => number of characters
  def char_count(canon=nil)
    if canon.nil?
      stat_all
    else
      stat_canon(canon)
    end
    @result
  end

  private

  # Dispatch on the node type / element name.
  def handle_node(node)
    return if node.comment?
    return handle_text(node) if node.text?

    case node.name
    when 'foreign', 'mulu', 'rdg', 'reg', 'sic'
      # these elements are left out of the count entirely
      nil
    when 'g'
      # a <g> element is counted as a single character
      @result[@work] += 1
    when 'note'
      handle_note(node)
    when 't'
      handle_t(node)
    else
      traverse(node)
    end
  end

  # Only notes placed inline or interlinear contribute to the count.
  def handle_note(node)
    traverse(node) if %w(inline interlinear).include?(node['place'])
  end

  # A <t> whose place attribute mentions "foot" is skipped.
  def handle_t(node)
    footnote = node.has_attribute?('place') && node['place'].include?('foot')
    traverse(node) unless footnote
  end

  # Add the length of a text node to the running total of the current work.
  def handle_text(node)
    text = node.content.chomp
    return if text.empty? || node.parent.name == 'app'

    # CBETA XML has superfluous line breaks between pieces of text
    @result[@work] += text.delete("\n\r").size
  end

  # Walk every canon folder under the XML root (hidden entries and the
  # "schema" folder are not canons).
  def stat_all
    canons = Dir.entries(@xml_root).reject do |name|
      name.start_with?('.') || name == 'schema'
    end
    canons.sort.each { |name| stat_canon(name) }
  end

  # Walk every volume folder of one canon.
  def stat_canon(canon)
    return if canon.nil?
    puts 'stat canon: ' + canon
    canon_dir = File.join(@xml_root, canon)
    volumes = Dir.entries(canon_dir).sort.reject { |name| name.start_with?('.') }
    volumes.each do |name|
      stat_vol(File.join(canon_dir, name))
    end
  end

  # Count one XML file, adding to the total of the work it belongs to.
  def stat_file(fn)
    # drop the volume part of the file name, e.g. "T01n0001" => "T0001"
    id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    # every file whose id starts with T0220 is folded into the one work T0220
    @work = id.start_with?('T0220') ? 'T0220' : id

    unless @result.key?(@work)
      puts "stat work: #{@work}"
      @result[@work] = 0
    end

    traverse(CBETA.open_xml(fn).at_xpath('/TEI/text/body'))
  end

  # Walk every file of one volume folder.
  def stat_vol(vol_folder)
    files = Dir.entries(vol_folder).sort.reject { |name| name.start_with?('.') }
    files.each do |name|
      stat_file(File.join(vol_folder, name))
    end
  end

  # Visit all children of a node.
  def traverse(node)
    node.children.each do |child|
      handle_node(child)
    end
  end

end
@@ -0,0 +1,123 @@
1
# Tallies how often each character occurs in the body text of the XML files
# found under <xml_root>/<canon>/<volume>/<file>.xml and reports the most
# frequent ones. Characters listed in CBETA::PUNCS are not tallied.
class CBETA::CharFrequency
  # @param xml_root [String] folder whose sub-folders are the canons
  # @param opts [Hash]
  # @option opts [Integer] :top (10) how many of the most frequent characters
  #   to report
  # @option opts [String] :group_by 'canon' or 'work' to keep a separate tally
  #   per canon / per work; any other value (or none) gives one overall tally
  def initialize(xml_root, opts={})
    @xml_root = xml_root
    @config = { top: 10 }.merge(opts)
    @result = {}
    @current = @result
  end

  # Run the tally for one canon, or for every canon when none is given.
  #
  # @param canon [String, nil] canon folder name; nil means all canons
  # @return [Array<Array>] without grouping: [character, count] pairs, most
  #   frequent first, at most :top of them
  # @return [Hash{String=>Array<Array>}] with :group_by 'canon' or 'work':
  #   group name => such a list of pairs for that group
  def char_freq(canon=nil)
    stat_all if canon.nil?
    stat_canon(canon)

    return top_entries(@result) unless grouped?

    # In grouped mode @result maps group name => tally hash, so each group
    # has to be ranked on its own (hashes cannot be sorted against each other).
    ranked = {}
    @result.each { |group, tally| ranked[group] = top_entries(tally) }
    ranked
  end

  private

  # True when tallies are kept per canon or per work.
  def grouped?
    %w(canon work).include?(@config[:group_by])
  end

  # The :top most frequent entries of a tally, most frequent first.
  # Array#last(n) simply returns everything when the tally has fewer than
  # n entries, where slicing with a negative range would return nil.
  def top_entries(tally)
    tally.sort_by { |_char, n| n }.last(@config[:top]).reverse
  end

  # Add one occurrence of c to the tally currently being filled.
  def count(c)
    @current[c] = @current.fetch(c, 0) + 1
  end

  # Dispatch on the node type / element name.
  def handle_node(e)
    return if e.comment?
    return handle_text(e) if e.text?
    return if %w(foreign mulu rdg reg sic).include? e.name

    case e.name
    when 'g' then count(e['ref']) # a <g> element is tallied under its ref value
    when 'note' then handle_note(e)
    when 't' then handle_t(e)
    else traverse(e)
    end
  end

  # Only notes placed inline or interlinear contribute to the tally.
  def handle_note(e)
    traverse(e) if %w(inline interlinear).include? e['place']
  end

  # A <t> whose place attribute mentions "foot" is skipped.
  def handle_t(e)
    place = e['place']
    return if place && place.include?('foot')
    traverse(e)
  end

  # Tally every character of a text node except punctuation.
  def handle_text(e)
    s = e.content.chomp
    return if s.empty?
    return if e.parent.name == 'app'

    # CBETA XML has superfluous line breaks between pieces of text
    s.delete("\n\r").each_char do |c|
      next if CBETA::PUNCS.include? c
      count(c)
    end
  end

  # Walk every canon folder under the XML root (hidden entries and the
  # "schema" folder are not canons).
  def stat_all
    Dir.entries(@xml_root).sort.each do |canon|
      next if canon.start_with? '.'
      next if canon == 'schema'
      stat_canon(canon)
    end
  end

  # Walk every volume folder of one canon.
  def stat_canon(canon)
    return if canon.nil?
    puts 'stat canon: ' + canon
    if @config[:group_by] == 'canon'
      # reuse an existing tally so repeated runs accumulate instead of reset
      @current = (@result[canon] ||= {})
    end
    folder = File.join(@xml_root, canon)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with? '.'
      stat_vol(File.join(folder, vol))
    end
  end

  # Tally one XML file.
  def stat_file(fn)
    if @config[:group_by] == 'work'
      # drop the volume part of the file name, e.g. "T01n0001" => "T0001"
      work = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
      # every file whose id starts with T0220 is folded into the one work T0220
      work = 'T0220' if work.start_with?('T0220')
      puts "stat work: #{work}"
      # A work can span several files: keep adding to its existing tally
      # rather than starting over with each file.
      @current = (@result[work] ||= {})
    else
      puts "stat file: #{fn}"
    end
    doc = CBETA.open_xml(fn)
    body = doc.at_xpath('/TEI/text/body')
    traverse(body)
  end

  # Walk every file of one volume folder.
  def stat_vol(vol_folder)
    Dir.entries(vol_folder).sort.each do |f|
      next if f.start_with? '.'
      stat_file(File.join(vol_folder, f))
    end
  end

  # Visit all children of a node.
  def traverse(e)
    e.children.each { |c| handle_node(c) }
  end

end
@@ -386,7 +386,7 @@ class CBETA::P5aToHTMLForPDF
386
386
  end
387
387
 
388
388
  def handle_p(e)
389
- "<p>%s</p>\n" % traverse(e)
389
+ "<div class='p'>%s</div>\n" % traverse(e)
390
390
  end
391
391
 
392
392
  def handle_row(e)
@@ -1,3 +1,11 @@
1
+ @font-face {
2
+ font-family: 'Songti';
3
+ /* src: url('/Library/Fonts/Songti.ttc'); */
4
+ src: url('/Library/Fonts/华文仿宋.ttf');
5
+ }
6
+ body {
7
+ font-family: Songti, PMingLiU-ExtB;
8
+ }
1
9
  div.lg {
2
10
  display: table;
3
11
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
@@ -18,6 +18,8 @@ extra_rdoc_files: []
18
18
  files:
19
19
  - lib/cbeta.rb
20
20
  - lib/cbeta/bm_to_text.rb
21
+ - lib/cbeta/char_count.rb
22
+ - lib/cbeta/char_freq.rb
21
23
  - lib/cbeta/gaiji.rb
22
24
  - lib/cbeta/html_to_pdf.rb
23
25
  - lib/cbeta/html_to_text.rb