cbeta 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 40faf97b4f5c3b0070145cefbdf86fe7eae39b72
4
- data.tar.gz: ebe45fbfc2bf8eb3e025754234ec7e23f6107c49
3
+ metadata.gz: a21aa8cd67f37fc6d5ca9d87720b1d5d0da3f7f1
4
+ data.tar.gz: bb7fb6cb3ab5d8b36492cc96728aa5a7fc9a7286
5
5
  SHA512:
6
- metadata.gz: 2d8bf5130fa58d65fe550c3855ba25b46714871e7e1abfe14d6b8546d3a2ca26e70ef1b23871cfc628b81f357fc83a84177a93a160ead4f6d75dd041c11ccd17
7
- data.tar.gz: cc189891bd08585916c2fe6fb0c645d280558a1f214443a0116b5d9595707a6586323c37d5d886a25bbb3b83357e51fba71ccb8505ce053e543c1d7659225135
6
+ metadata.gz: cb8bbfd96ad22c331e01059303ca539cb6718aae0caa551eba4a8064a7854d9f29bbd3bdb447bd06dd6a6f14b14363d16edb7b4f0932bab7fcfb0feccb9ba9bf
7
+ data.tar.gz: 2a4394f918a235d08790409488e5dc1a1fd280239d9e1df2ece6dc99628ffc67b73a5beeef199d35a5fe10f4c781cf7ee19910585ff3aa13719b6163b0725d46
@@ -7,6 +7,7 @@ require 'csv'
7
7
 
8
8
  class CBETA
9
9
  DATA = File.join(File.dirname(__FILE__), 'data')
10
+ PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
10
11
 
11
12
  # 將行首資訊轉為引用格式
12
13
  #
@@ -23,6 +24,13 @@ class CBETA
23
24
  nil
24
25
  end
25
26
 
27
# Parse an XML file and return it as a Nokogiri document with all
# namespaces removed, so callers can use unprefixed XPath expressions.
#
# The open file handle is passed straight to the parser instead of first
# reading the whole file into a Ruby String; the block form of File.open
# closes the handle as soon as parsing is done.
#
# @param fn [String] path of the XML file to read
# @return [Nokogiri::XML::Document] the parsed, namespace-free document
def self.open_xml(fn)
  doc = File.open(fn) { |io| Nokogiri::XML(io) }
  # remove_namespaces! is called for its side effect; its return value is
  # deliberately not relied on.
  doc.remove_namespaces!
  doc
end
33
+
26
34
  # 傳入 蘭札體 缺字碼,傳回 Unicode PUA 字元
27
35
  def self.ranjana_pua(gid)
28
36
  i = 0x10000 + gid[-4..-1].hex
@@ -100,10 +108,13 @@ class CBETA
100
108
  def get_category(book_id)
101
109
  @categories[book_id]
102
110
  end
111
+
103
112
  end
104
113
 
105
114
  require 'cbeta/gaiji'
106
115
  require 'cbeta/bm_to_text'
116
+ require 'cbeta/char_count'
117
+ require 'cbeta/char_freq'
107
118
  require 'cbeta/html_to_pdf'
108
119
  require 'cbeta/p5a_to_epub'
109
120
  require 'cbeta/p5a_to_html'
@@ -0,0 +1,99 @@
1
# Counts the characters of each work in a directory tree of CBETA XML files.
#
# The tree is walked as xml_root/<canon>/<volume>/<file>.xml. Totals are
# accumulated per work ID, which is derived from each file name.
class CBETA::CharCount
  # Elements whose content is left out of the count entirely.
  SKIPPED_ELEMENTS = %w[foreign mulu rdg reg sic].freeze

  # A <note> is counted only when its +place+ attribute is one of these.
  COUNTED_NOTE_PLACES = %w[inline interlinear].freeze

  # @param xml_root [String] folder holding one sub-folder per canon
  def initialize(xml_root)
    @xml_root = xml_root
    @result = {}
  end

  # Count characters for one canon, or for every canon when none is given.
  #
  # Totals are kept on the instance and are never reset, so repeated calls
  # keep adding to the same hash.
  #
  # @param canon [String, nil] canon folder name; nil scans every canon
  # @return [Hash{String => Integer}] work ID => number of characters
  def char_count(canon=nil)
    if canon.nil?
      stat_all
    else
      stat_canon(canon)
    end
    @result
  end

  private

  # Dispatch on node type / element name and add to the current work's total.
  def handle_node(e)
    return if e.comment?
    return handle_text(e) if e.text?
    return if SKIPPED_ELEMENTS.include?(e.name)

    case e.name
    when 'g'    then @result[@work] += 1 # each <g> element counts as one character
    when 'note' then handle_note(e)
    when 't'    then handle_t(e)
    else traverse(e)
    end
  end

  # Descend into a <note> only when it is placed inline or interlinear.
  def handle_note(e)
    traverse(e) if COUNTED_NOTE_PLACES.include?(e['place'])
  end

  # Descend into a <t> unless its +place+ attribute mentions 'foot'.
  def handle_t(e)
    place = e['place']
    return if place && place.include?('foot')

    traverse(e)
  end

  # Add the length of a text node, ignoring text directly inside <app>.
  def handle_text(e)
    return if e.parent.name == 'app'

    # CBETA XML has extra line breaks between runs of text; they are not
    # part of the content, so they are dropped before measuring.
    @result[@work] += e.content.delete("\r\n").size
  end

  # Scan every canon folder under the XML root (hidden entries, the
  # 'schema' folder and anything that is not a directory are skipped).
  def stat_all
    Dir.entries(@xml_root).sort.each do |canon|
      next if canon.start_with?('.')
      next if canon == 'schema'
      next unless File.directory?(File.join(@xml_root, canon))

      stat_canon(canon)
    end
  end

  # Scan every volume folder of one canon.
  def stat_canon(canon)
    return if canon.nil?

    puts 'stat canon: ' + canon
    folder = File.join(@xml_root, canon)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with?('.')

      path = File.join(folder, vol)
      # A stray regular file next to the volume folders must not abort the run.
      stat_vol(path) if File.directory?(path)
    end
  end

  # Count one XML file into the total of the work it belongs to.
  def stat_file(fn)
    @work = work_id(fn)
    unless @result.key?(@work)
      puts "stat work: #{@work}"
      @result[@work] = 0
    end

    body = CBETA.open_xml(fn).at_xpath('/TEI/text/body')
    # A document without /TEI/text/body contributes nothing instead of
    # raising NoMethodError on nil.
    traverse(body) unless body.nil?
  end

  # Derive the work ID from a file name: the volume part ("<digits>n")
  # after the leading capital letter is removed, and every ID starting
  # with 'T0220' is collapsed into the single work 'T0220'.
  def work_id(fn)
    id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    id.start_with?('T0220') ? 'T0220' : id
  end

  # Scan every XML file of one volume folder.
  def stat_vol(vol_folder)
    Dir.entries(vol_folder).sort.each do |f|
      next if f.start_with?('.')
      next unless File.extname(f) == '.xml'

      stat_file(File.join(vol_folder, f))
    end
  end

  # Visit every child of the given node.
  def traverse(e)
    e.children.each { |c| handle_node(c) }
  end
end
@@ -0,0 +1,123 @@
1
# Tallies how often each character occurs in a directory tree of CBETA XML
# files and reports the most frequent ones.
#
# The tree is walked as xml_root/<canon>/<volume>/<file>.xml. Characters
# listed in CBETA::PUNCS are not tallied.
class CBETA::CharFrequency
  # Elements whose content is left out of the tally entirely.
  SKIPPED_ELEMENTS = %w[foreign mulu rdg reg sic].freeze

  # A <note> is tallied only when its +place+ attribute is one of these.
  COUNTED_NOTE_PLACES = %w[inline interlinear].freeze

  # Values of the :group_by option that switch on per-group tallies.
  GROUPINGS = %w[canon work].freeze

  # @param xml_root [String] folder holding one sub-folder per canon
  # @param opts [Hash]
  # @option opts [Integer] :top (10) how many of the most frequent
  #   characters to report
  # @option opts [String] :group_by 'canon' or 'work' to keep a separate
  #   tally per canon / per work; any other value gives one overall tally
  def initialize(xml_root, opts={})
    @xml_root = xml_root
    @config = { top: 10 }.merge(opts)
    @result = {}
    @current = @result
  end

  # Tally one canon, or every canon when none is given, and report the
  # most frequent characters.
  #
  # Tallies are kept on the instance and are never reset, so repeated calls
  # keep adding to the same counts.
  #
  # @param canon [String, nil] canon folder name; nil scans every canon
  # @return [Array<Array>] without grouping: [character, count] pairs, most
  #   frequent first, at most :top of them
  # @return [Hash{String => Array<Array>}] with :group_by set: canon or work
  #   ID => the same kind of list for that group
  def char_freq(canon=nil)
    if canon.nil?
      stat_all
    else
      stat_canon(canon)
    end

    return top_entries(@result) unless grouped?

    @result.each_with_object({}) do |(group, tally), report|
      report[group] = top_entries(tally)
    end
  end

  private

  # True when tallies are kept per canon or per work.
  def grouped?
    GROUPINGS.include?(@config[:group_by])
  end

  # The :top most frequent entries of a tally, most frequent first.
  # Array#last copes with tallies holding fewer than :top entries, where a
  # negative-index slice would yield nil.
  def top_entries(tally)
    tally.sort_by { |_, n| n }.last(@config[:top]).reverse
  end

  # Add one occurrence of +c+ to the tally currently being filled.
  def count(c)
    @current[c] = @current.fetch(c, 0) + 1
  end

  # Dispatch on node type / element name.
  def handle_node(e)
    return if e.comment?
    return handle_text(e) if e.text?
    return if SKIPPED_ELEMENTS.include?(e.name)

    case e.name
    when 'g'    then count(e['ref']) # a <g> is tallied under its ref attribute value
    when 'note' then handle_note(e)
    when 't'    then handle_t(e)
    else traverse(e)
    end
  end

  # Descend into a <note> only when it is placed inline or interlinear.
  def handle_note(e)
    traverse(e) if COUNTED_NOTE_PLACES.include?(e['place'])
  end

  # Descend into a <t> unless its +place+ attribute mentions 'foot'.
  def handle_t(e)
    place = e['place']
    return if place && place.include?('foot')

    traverse(e)
  end

  # Tally every character of a text node, ignoring text directly inside
  # <app> and any character listed in CBETA::PUNCS.
  def handle_text(e)
    return if e.parent.name == 'app'

    # CBETA XML has extra line breaks between runs of text; they are not
    # part of the content, so they are dropped first.
    e.content.delete("\r\n").each_char do |c|
      count(c) unless CBETA::PUNCS.include?(c)
    end
  end

  # Scan every canon folder under the XML root (hidden entries, the
  # 'schema' folder and anything that is not a directory are skipped).
  def stat_all
    Dir.entries(@xml_root).sort.each do |canon|
      next if canon.start_with?('.')
      next if canon == 'schema'
      next unless File.directory?(File.join(@xml_root, canon))

      stat_canon(canon)
    end
  end

  # Scan every volume folder of one canon.
  def stat_canon(canon)
    return if canon.nil?

    puts 'stat canon: ' + canon
    # ||= keeps what was already tallied for this canon on an earlier call.
    @current = (@result[canon] ||= {}) if @config[:group_by] == 'canon'

    folder = File.join(@xml_root, canon)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with?('.')

      path = File.join(folder, vol)
      # A stray regular file next to the volume folders must not abort the run.
      stat_vol(path) if File.directory?(path)
    end
  end

  # Tally one XML file.
  def stat_file(fn)
    if @config[:group_by] == 'work'
      work = work_id(fn)
      puts "stat work: #{work}"
      # A work can be spread over several files (every ID starting with
      # 'T0220' maps to one work), so an existing tally is extended
      # rather than replaced.
      @current = (@result[work] ||= {})
    else
      puts "stat file: #{fn}"
    end

    body = CBETA.open_xml(fn).at_xpath('/TEI/text/body')
    # A document without /TEI/text/body contributes nothing instead of
    # raising NoMethodError on nil.
    traverse(body) unless body.nil?
  end

  # Derive the work ID from a file name: the volume part ("<digits>n")
  # after the leading capital letter is removed, and every ID starting
  # with 'T0220' is collapsed into the single work 'T0220'.
  def work_id(fn)
    id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    id.start_with?('T0220') ? 'T0220' : id
  end

  # Scan every XML file of one volume folder.
  def stat_vol(vol_folder)
    Dir.entries(vol_folder).sort.each do |f|
      next if f.start_with?('.')
      next unless File.extname(f) == '.xml'

      stat_file(File.join(vol_folder, f))
    end
  end

  # Visit every child of the given node.
  def traverse(e)
    e.children.each { |c| handle_node(c) }
  end
end
@@ -386,7 +386,7 @@ class CBETA::P5aToHTMLForPDF
386
386
  end
387
387
 
388
388
  def handle_p(e)
389
- "<p>%s</p>\n" % traverse(e)
389
+ "<div class='p'>%s</div>\n" % traverse(e)
390
390
  end
391
391
 
392
392
  def handle_row(e)
@@ -1,3 +1,11 @@
1
/* Register the body font under the family name 'Songti'.
   NOTE(review): the path is an absolute location on the build machine
   (it looks like a macOS system font folder); confirm it exists wherever
   the PDF is generated. */
@font-face {
  font-family: 'Songti';
  /* src: url('/Library/Fonts/Songti.ttc'); */
  src: url('/Library/Fonts/华文仿宋.ttf');
}
/* Fall back to PMingLiU-ExtB, then to the generic serif family, so text
   still renders with a serif face when neither named font is available. */
body {
  font-family: Songti, PMingLiU-ExtB, serif;
}
1
9
  div.lg {
2
10
  display: table;
3
11
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cbeta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ray Chou
@@ -18,6 +18,8 @@ extra_rdoc_files: []
18
18
  files:
19
19
  - lib/cbeta.rb
20
20
  - lib/cbeta/bm_to_text.rb
21
+ - lib/cbeta/char_count.rb
22
+ - lib/cbeta/char_freq.rb
21
23
  - lib/cbeta/gaiji.rb
22
24
  - lib/cbeta/html_to_pdf.rb
23
25
  - lib/cbeta/html_to_text.rb