cbeta 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +11 -0
- data/lib/cbeta/char_count.rb +99 -0
- data/lib/cbeta/char_freq.rb +123 -0
- data/lib/cbeta/p5a_to_html_for_pdf.rb +1 -1
- data/lib/data/html-for-pdf.css +8 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a21aa8cd67f37fc6d5ca9d87720b1d5d0da3f7f1
|
4
|
+
data.tar.gz: bb7fb6cb3ab5d8b36492cc96728aa5a7fc9a7286
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb8bbfd96ad22c331e01059303ca539cb6718aae0caa551eba4a8064a7854d9f29bbd3bdb447bd06dd6a6f14b14363d16edb7b4f0932bab7fcfb0feccb9ba9bf
|
7
|
+
data.tar.gz: 2a4394f918a235d08790409488e5dc1a1fd280239d9e1df2ece6dc99628ffc67b73a5beeef199d35a5fe10f4c781cf7ee19910585ff3aa13719b6163b0725d46
|
data/lib/cbeta.rb
CHANGED
@@ -7,6 +7,7 @@ require 'csv'
|
|
7
7
|
|
8
8
|
class CBETA
|
9
9
|
DATA = File.join(File.dirname(__FILE__), 'data')
|
10
|
+
PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
|
10
11
|
|
11
12
|
# 將行首資訊轉為引用格式
|
12
13
|
#
|
@@ -23,6 +24,13 @@ class CBETA
|
|
23
24
|
nil
|
24
25
|
end
|
25
26
|
|
27
|
+
# Read the XML file at +fn+, parse it with Nokogiri and strip every
# namespace from the resulting document.
#
# @param fn [String] path of the XML file to load
# @return [Nokogiri::XML::Document] the parsed, namespace-free document
def self.open_xml(fn)
  xml = Nokogiri::XML(File.read(fn))
  xml.remove_namespaces!
  xml
end
|
33
|
+
|
26
34
|
# 傳入 蘭札體 缺字碼,傳回 Unicode PUA 字元
|
27
35
|
def self.ranjana_pua(gid)
|
28
36
|
i = 0x10000 + gid[-4..-1].hex
|
@@ -100,10 +108,13 @@ class CBETA
|
|
100
108
|
def get_category(book_id)
|
101
109
|
@categories[book_id]
|
102
110
|
end
|
111
|
+
|
103
112
|
end
|
104
113
|
|
105
114
|
require 'cbeta/gaiji'
|
106
115
|
require 'cbeta/bm_to_text'
|
116
|
+
require 'cbeta/char_count'
|
117
|
+
require 'cbeta/char_freq'
|
107
118
|
require 'cbeta/html_to_pdf'
|
108
119
|
require 'cbeta/p5a_to_epub'
|
109
120
|
require 'cbeta/p5a_to_html'
|
@@ -0,0 +1,99 @@
|
|
1
|
+
# Counts characters per work over a tree of CBETA XML files.
#
# The directory walk below expects xml_root/<canon>/<volume>/<file>.xml.
# Progress messages are written to standard output while the tree is walked.
class CBETA::CharCount
  # @param xml_root [String] folder containing one sub-folder per canon
  def initialize(xml_root)
    @xml_root = xml_root
    @result = {}
  end

  # Count the characters of one canon, or of every canon when none is given.
  # Counts accumulate in the same hash across calls on one instance.
  #
  # @param canon [String, nil] canon folder name; nil walks all canons
  # @return [Hash{String => Integer}] work id => number of characters
  def char_count(canon=nil)
    if canon.nil?
      stat_all
    else
      stat_canon(canon)
    end
    @result
  end

  private

  # Dispatch on the node type / element name of +e+.
  # Comments and the elements foreign, mulu, rdg, reg and sic are skipped.
  def handle_node(e)
    return if e.comment?
    return handle_text(e) if e.text?

    tag = e.name
    if tag == 'g'
      # a <g> element counts as exactly one character
      @result[@work] += 1
    elsif tag == 'note'
      handle_note(e)
    elsif tag == 't'
      handle_t(e)
    elsif !%w(foreign mulu rdg reg sic).include?(tag)
      traverse(e)
    end
  end

  # Only notes placed inline or interlinear are descended into.
  def handle_note(e)
    traverse(e) if %w(inline interlinear).include?(e['place'])
  end

  # <t> elements whose place attribute mentions 'foot' are ignored.
  def handle_t(e)
    return if e.has_attribute?('place') && e['place'].include?('foot')
    traverse(e)
  end

  # Add the length of a text node to the current work's total.
  # Text nodes that are direct children of <app> are not counted.
  def handle_text(e)
    text = e.content.chomp
    return if text.empty? || e.parent.name == 'app'

    # CBETA XML contains superfluous line breaks between characters;
    # they are dropped before measuring the length.
    @result[@work] += text.delete("\n\r").size
  end

  # Walk every canon folder under the XML root, skipping hidden entries
  # and the 'schema' folder.
  def stat_all
    canons = Dir.entries(@xml_root).sort.reject do |name|
      name.start_with?('.') || name == 'schema'
    end
    canons.each { |canon| stat_canon(canon) }
  end

  # Walk every volume folder of one canon; a nil canon is a no-op.
  def stat_canon(canon)
    return if canon.nil?
    puts 'stat canon: ' + canon
    folder = File.join(@xml_root, canon)
    Dir.entries(folder).sort.each do |vol|
      stat_vol(File.join(folder, vol)) unless vol.start_with?('.')
    end
  end

  # Count one XML file into the total of the work it belongs to.
  def stat_file(fn)
    # Drop the volume digits and the 'n' that follow the leading canon
    # letter, e.g. "T01n0001" becomes "T0001".
    id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    # every file whose id starts with T0220 is merged into one entry
    @work = id.start_with?('T0220') ? 'T0220' : id
    unless @result.key?(@work)
      puts "stat work: #{@work}"
      @result[@work] = 0
    end

    body = CBETA.open_xml(fn).at_xpath('/TEI/text/body')
    traverse(body)
  end

  # Process every non-hidden entry of a volume folder as an XML file.
  def stat_vol(vol_folder)
    Dir.entries(vol_folder).sort.each do |f|
      stat_file(File.join(vol_folder, f)) unless f.start_with?('.')
    end
  end

  # Visit each direct child of +e+.
  def traverse(e)
    e.children.each { |child| handle_node(child) }
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# Computes the most frequent characters over a tree of CBETA XML files.
#
# The directory walk below expects xml_root/<canon>/<volume>/<file>.xml.
# Progress messages are written to standard output while the tree is walked.
class CBETA::CharFrequency
  # @param xml_root [String] folder containing one sub-folder per canon
  # @param opts [Hash] options merged over the defaults
  # @option opts [Integer] :top (10) how many of the most frequent
  #   characters to return
  # @option opts [String] :group_by 'canon' or 'work' to keep a separate
  #   tally per canon / per work; any other value gives one overall tally
  def initialize(xml_root, opts={})
    @xml_root = xml_root
    @config = { top: 10 }.merge(opts)
    @result = {}
    @current = @result
  end

  # Tally characters for one canon, or for every canon when none is given,
  # and return the most frequent ones.
  #
  # @param canon [String, nil] canon folder name; nil walks all canons
  # @return [Array<Array(String, Integer)>] without grouping: at most :top
  #   [character, count] pairs, highest count first
  # @return [Hash{String => Array<Array(String, Integer)>}] with :group_by
  #   'canon' or 'work': group id => list of pairs as described above
  def char_freq(canon=nil)
    stat_all if canon.nil?
    stat_canon(canon)

    if grouped?
      # @result maps group id => tally hash; rank each group separately
      # (sorting the nested hashes themselves would raise).
      @result.each_with_object({}) do |(group, tally), ranked|
        ranked[group] = top_entries(tally)
      end
    else
      top_entries(@result)
    end
  end

  private

  # True when tallies are kept per canon or per work.
  def grouped?
    %w(canon work).include?(@config[:group_by])
  end

  # Return the :top most frequent [key, count] pairs of +tally+, highest
  # count first. Array#last(n) is used instead of a negative range so a
  # tally with fewer than :top entries yields a shorter list, not nil.
  def top_entries(tally)
    tally.sort_by { |_, n| n }.last(@config[:top]).reverse
  end

  # Increment the counter of +c+ in the tally currently being filled.
  def count(c)
    @current[c] = @current.fetch(c, 0) + 1
  end

  # Dispatch on the node type / element name of +e+.
  # Comments and the elements foreign, mulu, rdg, reg and sic are skipped.
  def handle_node(e)
    return if e.comment?
    return handle_text(e) if e.text?
    return if %w(foreign mulu rdg reg sic).include?(e.name)

    case e.name
    when 'g'    then count(e['ref']) # a <g> is tallied under its ref value
    when 'note' then handle_note(e)
    when 't'    then handle_t(e)
    else traverse(e)
    end
  end

  # Only notes placed inline or interlinear are descended into.
  def handle_note(e)
    traverse(e) if %w(inline interlinear).include?(e['place'])
  end

  # <t> elements whose place attribute mentions 'foot' are ignored.
  def handle_t(e)
    return if e.has_attribute?('place') && e['place'].include?('foot')
    traverse(e)
  end

  # Tally every character of a text node except those in CBETA::PUNCS.
  # Text nodes that are direct children of <app> are not counted.
  def handle_text(e)
    s = e.content.chomp
    return if s.empty?
    return if e.parent.name == 'app'

    # CBETA XML contains superfluous line breaks between characters.
    s = s.delete("\n\r")

    s.each_char do |c|
      next if CBETA::PUNCS.include?(c)
      count(c)
    end
  end

  # Walk every canon folder under the XML root, skipping hidden entries
  # and the 'schema' folder.
  def stat_all
    Dir.entries(@xml_root).sort.each do |canon|
      next if canon.start_with?('.')
      next if canon == 'schema'
      stat_canon(canon)
    end
  end

  # Walk every volume folder of one canon; a nil canon is a no-op.
  def stat_canon(canon)
    return if canon.nil?
    puts 'stat canon: ' + canon
    if @config[:group_by] == 'canon'
      # reuse an existing tally so earlier counts are not discarded
      @result[canon] ||= {}
      @current = @result[canon]
    end
    folder = File.join(@xml_root, canon)
    Dir.entries(folder).sort.each do |vol|
      next if vol.start_with?('.')
      stat_vol(File.join(folder, vol))
    end
  end

  # Tally one XML file into the current (or per-work) tally.
  def stat_file(fn)
    if @config[:group_by] == 'work'
      # Drop the volume digits and the 'n' that follow the leading canon
      # letter, e.g. "T01n0001" becomes "T0001".
      work = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
      # every file whose id starts with T0220 is merged into one entry
      work = 'T0220' if work.start_with?('T0220')
      puts "stat work: #{work}"
      # a work may span several files: keep accumulating into one tally
      @result[work] ||= {}
      @current = @result[work]
    else
      puts "stat file: #{fn}"
    end
    doc = CBETA.open_xml(fn)
    body = doc.at_xpath('/TEI/text/body')
    traverse(body)
  end

  # Process every non-hidden entry of a volume folder as an XML file.
  def stat_vol(vol_folder)
    Dir.entries(vol_folder).sort.each do |f|
      next if f.start_with?('.')
      stat_file(File.join(vol_folder, f))
    end
  end

  # Visit each direct child of +e+.
  def traverse(e)
    e.children.each { |c| handle_node(c) }
  end
end
|
data/lib/data/html-for-pdf.css
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
@@ -18,6 +18,8 @@ extra_rdoc_files: []
|
|
18
18
|
files:
|
19
19
|
- lib/cbeta.rb
|
20
20
|
- lib/cbeta/bm_to_text.rb
|
21
|
+
- lib/cbeta/char_count.rb
|
22
|
+
- lib/cbeta/char_freq.rb
|
21
23
|
- lib/cbeta/gaiji.rb
|
22
24
|
- lib/cbeta/html_to_pdf.rb
|
23
25
|
- lib/cbeta/html_to_text.rb
|