cbeta 1.2.0 → 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cbeta.rb +11 -0
- data/lib/cbeta/char_count.rb +99 -0
- data/lib/cbeta/char_freq.rb +123 -0
- data/lib/cbeta/p5a_to_html_for_pdf.rb +1 -1
- data/lib/data/html-for-pdf.css +8 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a21aa8cd67f37fc6d5ca9d87720b1d5d0da3f7f1
|
4
|
+
data.tar.gz: bb7fb6cb3ab5d8b36492cc96728aa5a7fc9a7286
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb8bbfd96ad22c331e01059303ca539cb6718aae0caa551eba4a8064a7854d9f29bbd3bdb447bd06dd6a6f14b14363d16edb7b4f0932bab7fcfb0feccb9ba9bf
|
7
|
+
data.tar.gz: 2a4394f918a235d08790409488e5dc1a1fd280239d9e1df2ece6dc99628ffc67b73a5beeef199d35a5fe10f4c781cf7ee19910585ff3aa13719b6163b0725d46
|
data/lib/cbeta.rb
CHANGED
@@ -7,6 +7,7 @@ require 'csv'
|
|
7
7
|
|
8
8
|
class CBETA
|
9
9
|
DATA = File.join(File.dirname(__FILE__), 'data')
|
10
|
+
PUNCS = '.[]。,、?「」『』《》<>〈〉〔〕[]【】〖〗'
|
10
11
|
|
11
12
|
# 將行首資訊轉為引用格式
|
12
13
|
#
|
@@ -23,6 +24,13 @@ class CBETA
|
|
23
24
|
nil
|
24
25
|
end
|
25
26
|
|
27
|
+
# Reads the file at +fn+, parses it with Nokogiri and strips every XML
# namespace from the resulting document.
#
# @param fn [String] path of the XML file to read
# @return [Object] the document returned by Nokogiri::XML, after
#   remove_namespaces! has been called on it
def self.open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |parsed| parsed.remove_namespaces! }
end
|
33
|
+
|
26
34
|
# 傳入 蘭札體 缺字碼,傳回 Unicode PUA 字元
|
27
35
|
def self.ranjana_pua(gid)
|
28
36
|
i = 0x10000 + gid[-4..-1].hex
|
@@ -100,10 +108,13 @@ class CBETA
|
|
100
108
|
# Looks up +book_id+ in @categories.
#
# @param book_id [Object] key to look up (presumably a book/canon id
#   string -- TODO confirm against where @categories is filled)
# @return [Object] whatever @categories stores for that key
def get_category(book_id)
  @categories[book_id]
end
|
111
|
+
|
103
112
|
end
|
104
113
|
|
105
114
|
require 'cbeta/gaiji'
|
106
115
|
require 'cbeta/bm_to_text'
|
116
|
+
require 'cbeta/char_count'
|
117
|
+
require 'cbeta/char_freq'
|
107
118
|
require 'cbeta/html_to_pdf'
|
108
119
|
require 'cbeta/p5a_to_epub'
|
109
120
|
require 'cbeta/p5a_to_html'
|
@@ -0,0 +1,99 @@
|
|
1
|
+
class CBETA
  # Counts the characters of every work found under an XML root folder.
  #
  # The folder is walked as xml_root/<canon>/<volume>/<file>.xml. Each file
  # is opened with CBETA.open_xml and the subtree at /TEI/text/body is
  # traversed; text nodes add their length and every <g> element adds one.
  class CharCount
    # Elements whose whole subtree is left out of the count.
    SKIPPED_ELEMENTS = %w[foreign mulu rdg reg sic].freeze

    # A <note> is only counted when its "place" attribute is one of these.
    COUNTED_NOTE_PLACES = %w[inline interlinear].freeze

    # @param xml_root [String] folder that contains one sub-folder per canon
    def initialize(xml_root)
      @xml_root = xml_root
      @result = {}
    end

    # Runs the count and returns the accumulated totals.
    #
    # Totals are kept on the instance, so successive calls add up.
    #
    # @param canon [String, nil] name of a single canon folder; when nil
    #   every canon folder under the root is processed
    # @return [Hash{String => Integer}] work id => number of characters
    def char_count(canon = nil)
      if canon.nil?
        stat_all
      else
        stat_canon(canon)
      end
      @result
    end

    private

    # Dispatches one XML node: comments are ignored, text is counted,
    # skipped elements are dropped, everything else is descended into.
    def handle_node(e)
      return if e.comment?
      return handle_text(e) if e.text?
      return if SKIPPED_ELEMENTS.include?(e.name)

      case e.name
      when 'g' then @result[@work] += 1
      when 'note' then handle_note(e)
      when 't' then handle_t(e)
      else traverse(e)
      end
    end

    # Counts a note only when it is placed inline or interlinear.
    def handle_note(e)
      traverse(e) if COUNTED_NOTE_PLACES.include?(e['place'])
    end

    # Counts a <t> element unless its "place" attribute mentions "foot".
    def handle_t(e)
      return if e.has_attribute?('place') && e['place'].include?('foot')

      traverse(e)
    end

    # Adds the length of a text node to the current work.
    def handle_text(e)
      text = e.content.chomp
      return if text.empty?
      return if e.parent.name == 'app'

      # CBETA XML has superfluous line breaks inside running text.
      @result[@work] += text.delete("\r\n").size
    end

    # Processes every canon folder under the root.
    def stat_all
      Dir.entries(@xml_root).sort.each do |canon|
        next if canon.start_with?('.') || canon == 'schema'
        # Plain files next to the canon folders (e.g. a README) are not canons.
        next unless File.directory?(File.join(@xml_root, canon))

        stat_canon(canon)
      end
    end

    # Processes every volume folder of one canon.
    def stat_canon(canon)
      return if canon.nil?

      puts "stat canon: #{canon}"
      folder = File.join(@xml_root, canon)
      Dir.entries(folder).sort.each do |vol|
        next if vol.start_with?('.')

        vol_path = File.join(folder, vol)
        # Only folders are volumes; stray files would make Dir.entries raise.
        stat_vol(vol_path) if File.directory?(vol_path)
      end
    end

    # Counts one XML file and adds the total to the work it belongs to.
    def stat_file(fn)
      @work = work_id(fn)
      unless @result.key?(@work)
        puts "stat work: #{@work}"
        @result[@work] = 0
      end

      body = CBETA.open_xml(fn).at_xpath('/TEI/text/body')
      traverse(body)
    end

    # Processes every .xml file of one volume folder.
    def stat_vol(vol_folder)
      Dir.entries(vol_folder).sort.each do |f|
        next if f.start_with?('.')
        # stat_file derives the work id from a ".xml" basename, so anything
        # else in the folder is not an input file.
        next unless File.extname(f) == '.xml'

        stat_file(File.join(vol_folder, f))
      end
    end

    # Visits every child of +e+.
    def traverse(e)
      e.children.each { |child| handle_node(child) }
    end

    # Derives the work id from a file name by dropping the volume part,
    # e.g. "T01n0001.xml" => "T0001". Every id starting with "T0220" is
    # merged into "T0220".
    #
    # NOTE(review): the pattern only accepts a single capital letter before
    # the volume digits; confirm whether multi-letter canon ids can occur.
    def work_id(fn)
      id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
      id.start_with?('T0220') ? 'T0220' : id
    end
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
class CBETA
  # Ranks the most frequent characters found under an XML root folder.
  #
  # The folder is walked as xml_root/<canon>/<volume>/<file>.xml. Each file
  # is opened with CBETA.open_xml and the subtree at /TEI/text/body is
  # traversed. Characters listed in CBETA::PUNCS are not counted; a <g>
  # element is counted under the value of its "ref" attribute.
  class CharFrequency
    # Elements whose whole subtree is left out of the statistics.
    SKIPPED_ELEMENTS = %w[foreign mulu rdg reg sic].freeze

    # A <note> is only counted when its "place" attribute is one of these.
    COUNTED_NOTE_PLACES = %w[inline interlinear].freeze

    # Values of the :group_by option that switch on grouped counting.
    GROUPINGS = %w[canon work].freeze

    # @param xml_root [String] folder that contains one sub-folder per canon
    # @param opts [Hash]
    # @option opts [Integer] :top (10) how many characters to report
    # @option opts [String] :group_by 'canon' or 'work' to keep separate
    #   counts per canon folder or per work; any other value counts globally
    def initialize(xml_root, opts = {})
      @xml_root = xml_root
      @config = { top: 10 }.merge(opts)
      @result = {}
      @current = @result
    end

    # Runs the statistics and returns the most frequent characters.
    #
    # Counts are kept on the instance, so successive calls add up.
    #
    # @param canon [String, nil] name of a single canon folder; when nil
    #   every canon folder under the root is processed
    # @return [Array<Array(String, Integer)>] without grouping: at most :top
    #   [character, count] pairs, most frequent first
    # @return [Hash{String => Array<Array(String, Integer)>}] with grouping:
    #   group name => that group's own top list
    def char_freq(canon = nil)
      if canon.nil?
        stat_all
      else
        stat_canon(canon)
      end
      return top_chars(@result) unless GROUPINGS.include?(@config[:group_by])

      @result.each_with_object({}) do |(group, counts), ranked|
        ranked[group] = top_chars(counts)
      end
    end

    private

    # Adds one occurrence of +c+ to the table currently being filled.
    def count(c)
      @current[c] = @current.fetch(c, 0) + 1
    end

    # Dispatches one XML node: comments are ignored, text is counted,
    # skipped elements are dropped, everything else is descended into.
    def handle_node(e)
      return if e.comment?
      return handle_text(e) if e.text?
      return if SKIPPED_ELEMENTS.include?(e.name)

      case e.name
      when 'g' then count(e['ref'])
      when 'note' then handle_note(e)
      when 't' then handle_t(e)
      else traverse(e)
      end
    end

    # Counts a note only when it is placed inline or interlinear.
    def handle_note(e)
      traverse(e) if COUNTED_NOTE_PLACES.include?(e['place'])
    end

    # Counts a <t> element unless its "place" attribute mentions "foot".
    def handle_t(e)
      return if e.has_attribute?('place') && e['place'].include?('foot')

      traverse(e)
    end

    # Counts every non-punctuation character of a text node.
    def handle_text(e)
      text = e.content.chomp
      return if text.empty?
      return if e.parent.name == 'app'

      # CBETA XML has superfluous line breaks inside running text.
      text.delete("\r\n").each_char do |c|
        count(c) unless CBETA::PUNCS.include?(c)
      end
    end

    # Processes every canon folder under the root.
    def stat_all
      Dir.entries(@xml_root).sort.each do |canon|
        next if canon.start_with?('.') || canon == 'schema'
        # Plain files next to the canon folders (e.g. a README) are not canons.
        next unless File.directory?(File.join(@xml_root, canon))

        stat_canon(canon)
      end
    end

    # Processes every volume folder of one canon.
    def stat_canon(canon)
      return if canon.nil?

      puts "stat canon: #{canon}"
      if @config[:group_by] == 'canon'
        @result[canon] = {}
        @current = @result[canon]
      end

      folder = File.join(@xml_root, canon)
      Dir.entries(folder).sort.each do |vol|
        next if vol.start_with?('.')

        vol_path = File.join(folder, vol)
        # Only folders are volumes; stray files would make Dir.entries raise.
        stat_vol(vol_path) if File.directory?(vol_path)
      end
    end

    # Counts the characters of one XML file.
    def stat_file(fn)
      if @config[:group_by] == 'work'
        work = work_id(fn)
        puts "stat work: #{work}"
        # Several files can map to the same work id, so an existing table
        # must be reused instead of being replaced by an empty one.
        @current = (@result[work] ||= {})
      else
        puts "stat file: #{fn}"
      end

      body = CBETA.open_xml(fn).at_xpath('/TEI/text/body')
      traverse(body)
    end

    # Processes every .xml file of one volume folder.
    def stat_vol(vol_folder)
      Dir.entries(vol_folder).sort.each do |f|
        next if f.start_with?('.')
        # The work id is derived from a ".xml" basename, so anything else in
        # the folder is not an input file.
        next unless File.extname(f) == '.xml'

        stat_file(File.join(vol_folder, f))
      end
    end

    # Visits every child of +e+.
    def traverse(e)
      e.children.each { |child| handle_node(child) }
    end

    # Returns the :top most frequent entries of +counts+, highest first.
    # Array#last never returns nil, so a table with fewer than :top entries
    # simply yields all of them.
    def top_chars(counts)
      counts.sort_by { |_char, n| n }.last(@config[:top]).reverse
    end

    # Derives the work id from a file name by dropping the volume part,
    # e.g. "T01n0001.xml" => "T0001". Every id starting with "T0220" is
    # merged into "T0220".
    #
    # NOTE(review): the pattern only accepts a single capital letter before
    # the volume digits; confirm whether multi-letter canon ids can occur.
    def work_id(fn)
      id = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
      id.start_with?('T0220') ? 'T0220' : id
    end
  end
end
|
data/lib/data/html-for-pdf.css
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cbeta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0
|
4
|
+
version: 1.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ray Chou
|
@@ -18,6 +18,8 @@ extra_rdoc_files: []
|
|
18
18
|
files:
|
19
19
|
- lib/cbeta.rb
|
20
20
|
- lib/cbeta/bm_to_text.rb
|
21
|
+
- lib/cbeta/char_count.rb
|
22
|
+
- lib/cbeta/char_freq.rb
|
21
23
|
- lib/cbeta/gaiji.rb
|
22
24
|
- lib/cbeta/html_to_pdf.rb
|
23
25
|
- lib/cbeta/html_to_text.rb
|