chupa-text 1.3.4 → 1.3.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +12 -2
- data/doc/text/news.md +18 -0
- data/lib/chupa-text/decomposers/csv.rb +3 -3
- data/lib/chupa-text/decomposers/office-open-xml-document.rb +1 -1
- data/lib/chupa-text/decomposers/office-open-xml-presentation.rb +2 -2
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +3 -3
- data/lib/chupa-text/decomposers/office-open-xml.rb +59 -6
- data/lib/chupa-text/decomposers/opendocument-presentation.rb +2 -2
- data/lib/chupa-text/decomposers/opendocument-spreadsheet.rb +4 -4
- data/lib/chupa-text/decomposers/opendocument-text.rb +2 -2
- data/lib/chupa-text/decomposers/xml.rb +2 -2
- data/lib/chupa-text/sax-parser.rb +17 -6
- data/lib/chupa-text/version.rb +2 -2
- data/test/decomposers/test-office-open-xml-workbook.rb +76 -1
- data/test/fixture/xlsx/complex-shared-strings.xlsx +0 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
|
4
|
+
data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
|
7
|
+
data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
|
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
# -*-
|
1
|
+
# -*- ruby -*-
|
2
2
|
#
|
3
|
-
# Copyright (C) 2013 Kouhei
|
3
|
+
# Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
|
4
4
|
#
|
5
5
|
# This library is free software; you can redistribute it and/or
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -46,3 +46,13 @@ desc "Run tests"
|
|
46
46
|
task :test do
|
47
47
|
ruby("test/run-test.rb")
|
48
48
|
end
|
49
|
+
|
50
|
+
release_task = Rake.application["release"]
|
51
|
+
# We use Trusted Publishing.
|
52
|
+
release_task.prerequisites.delete("build")
|
53
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
54
|
+
release_task_comment = release_task.comment
|
55
|
+
if release_task_comment
|
56
|
+
release_task.clear_comments
|
57
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
58
|
+
end
|
data/doc/text/news.md
CHANGED
@@ -1,5 +1,23 @@
|
|
1
1
|
# News
|
2
2
|
|
3
|
+
## 1.3.6: 2025-01-10
|
4
|
+
|
5
|
+
### Fixes
|
6
|
+
|
7
|
+
* xlsx: Fixed a bug that wrong text is extracted with complex shared
|
8
|
+
strings.
|
9
|
+
* Reported by Tomohisa Kusukawa.
|
10
|
+
|
11
|
+
### Thanks
|
12
|
+
|
13
|
+
* Tomohisa Kusukawa
|
14
|
+
|
15
|
+
## 1.3.5: 2024-09-22
|
16
|
+
|
17
|
+
### Improvements
|
18
|
+
|
19
|
+
* Added support for REXML 3.3.2 or later.
|
20
|
+
|
3
21
|
## 1.3.4: 2024-09-22
|
4
22
|
|
5
23
|
### Improvements
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -36,7 +36,7 @@ module ChupaText
|
|
36
36
|
end
|
37
37
|
|
38
38
|
def decompose(data)
|
39
|
-
text = ""
|
39
|
+
text = +""
|
40
40
|
data.open do |input|
|
41
41
|
begin
|
42
42
|
csv = ::CSV.new(input, liberal_parsing: true)
|
@@ -68,7 +68,7 @@ module ChupaText
|
|
68
68
|
width, height = data.expected_screenshot_size
|
69
69
|
max_n_lines = 10
|
70
70
|
font_size = height / max_n_lines
|
71
|
-
target_text = ""
|
71
|
+
target_text = +""
|
72
72
|
text.each_line.with_index do |line, i|
|
73
73
|
break if i == max_n_lines
|
74
74
|
target_text << line
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -56,7 +56,7 @@ module ChupaText
|
|
56
56
|
case entry.zip_path
|
57
57
|
when /\Appt\/slides\/slide(\d+)\.xml/
|
58
58
|
nth_slide = Integer($1, 10)
|
59
|
-
slide_text = ""
|
59
|
+
slide_text = +""
|
60
60
|
extract_text(entry, slide_text)
|
61
61
|
context[:slides] << [nth_slide, slide_text]
|
62
62
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019-
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -49,7 +49,7 @@ module ChupaText
|
|
49
49
|
def process_entry(entry, context)
|
50
50
|
case entry.zip_path
|
51
51
|
when "xl/sharedStrings.xml"
|
52
|
-
|
52
|
+
extract_shared_strings(entry, context[:shared_strings])
|
53
53
|
when "xl/workbook.xml"
|
54
54
|
listener = WorkbookListener.new(context[:sheet_names])
|
55
55
|
parse(entry.file_data, listener)
|
@@ -73,7 +73,7 @@ module ChupaText
|
|
73
73
|
sheets = context[:sheets].sort_by(&:first).collect(&:last)
|
74
74
|
sheet_names = context[:sheet_names]
|
75
75
|
sheets.each_with_index do |sheet, i|
|
76
|
-
sheet_text = ""
|
76
|
+
sheet_text = +""
|
77
77
|
sheet.each do |row|
|
78
78
|
row_texts = row.collect do |cell|
|
79
79
|
case cell
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -70,6 +70,11 @@ module ChupaText
|
|
70
70
|
parse(entry.file_data, listener)
|
71
71
|
end
|
72
72
|
|
73
|
+
def extract_shared_strings(entry, shared_strings)
|
74
|
+
listener = SharedStringsListener.new(shared_strings, @namespace_uri)
|
75
|
+
parse(entry.file_data, listener)
|
76
|
+
end
|
77
|
+
|
73
78
|
def log_tag
|
74
79
|
"[decomposer][office-open-xml]"
|
75
80
|
end
|
@@ -90,26 +95,74 @@ module ChupaText
|
|
90
95
|
end
|
91
96
|
|
92
97
|
def end_element(uri, local_name, qname)
|
98
|
+
if uri == @target_uri
|
99
|
+
case local_name
|
100
|
+
when "p", "br"
|
101
|
+
@output << "\n"
|
102
|
+
when "t"
|
103
|
+
@in_target = false
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
def characters(text)
|
109
|
+
add_text(text)
|
110
|
+
end
|
111
|
+
|
112
|
+
def cdata(content)
|
113
|
+
add_text(content)
|
114
|
+
end
|
115
|
+
|
116
|
+
private
|
117
|
+
def add_text(text)
|
118
|
+
return unless @in_target
|
119
|
+
@output << text
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
class SharedStringsListener < SAXListener
|
124
|
+
def initialize(output, target_uri)
|
125
|
+
@output = output
|
126
|
+
@target_uri = target_uri
|
127
|
+
@tag_stack = []
|
93
128
|
@in_target = false
|
129
|
+
@current_text = +""
|
130
|
+
end
|
131
|
+
|
132
|
+
def start_element(uri, local_name, qname, attributes)
|
133
|
+
@tag_stack << local_name
|
134
|
+
|
135
|
+
return unless uri == @target_uri
|
136
|
+
case local_name
|
137
|
+
when "t"
|
138
|
+
@in_target = true
|
139
|
+
@current_text = +""
|
140
|
+
end
|
141
|
+
end
|
94
142
|
|
143
|
+
def end_element(uri, local_name, qname)
|
95
144
|
return unless uri == @target_uri
|
96
145
|
case local_name
|
97
|
-
when "
|
98
|
-
@
|
146
|
+
when "t"
|
147
|
+
add_text(@current_text)
|
148
|
+
@in_target = false
|
99
149
|
end
|
150
|
+
ensure
|
151
|
+
@tag_stack.pop
|
100
152
|
end
|
101
153
|
|
102
154
|
def characters(text)
|
103
|
-
|
155
|
+
@current_text << text if @in_target
|
104
156
|
end
|
105
157
|
|
106
158
|
def cdata(content)
|
107
|
-
|
159
|
+
@current_text << content if @in_target
|
108
160
|
end
|
109
161
|
|
110
162
|
private
|
111
163
|
def add_text(text)
|
112
|
-
|
164
|
+
parent_tag = @tag_stack[-2]
|
165
|
+
return unless parent_tag == "si"
|
113
166
|
@output << text
|
114
167
|
end
|
115
168
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -72,7 +72,7 @@ module ChupaText
|
|
72
72
|
when DRAW_URI
|
73
73
|
case local_name
|
74
74
|
when "page"
|
75
|
-
@slides << {text: ""}
|
75
|
+
@slides << {text: +""}
|
76
76
|
end
|
77
77
|
end
|
78
78
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -96,9 +96,9 @@ module ChupaText
|
|
96
96
|
when "table-row"
|
97
97
|
@sheets.last[:rows] << []
|
98
98
|
when "table-cell"
|
99
|
-
@sheets.last[:rows].last << {text: ""}
|
99
|
+
@sheets.last[:rows].last << {text: +""}
|
100
100
|
when "covered-table-cell"
|
101
|
-
@sheets.last[:rows].last << {text: ""}
|
101
|
+
@sheets.last[:rows].last << {text: +""}
|
102
102
|
when "shapes"
|
103
103
|
@in_shapes = true
|
104
104
|
end
|
@@ -116,7 +116,7 @@ module ChupaText
|
|
116
116
|
case local_name
|
117
117
|
when "table"
|
118
118
|
sheet = @sheets.last
|
119
|
-
text = ""
|
119
|
+
text = +""
|
120
120
|
shape_texts = sheet[:shape_texts]
|
121
121
|
unless shape_texts.empty?
|
122
122
|
text << shape_texts.join("\n") << "\n"
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -29,7 +29,7 @@ module ChupaText
|
|
29
29
|
|
30
30
|
private
|
31
31
|
def process_content(entry, context, &block)
|
32
|
-
context[:text] = ""
|
32
|
+
context[:text] = +""
|
33
33
|
listener = TextListener.new(context[:text])
|
34
34
|
parse(entry.file_data, listener)
|
35
35
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -29,7 +29,7 @@ module ChupaText
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def decompose(data)
|
32
|
-
text = ""
|
32
|
+
text = +""
|
33
33
|
listener = Listener.new(text)
|
34
34
|
data.open do |input|
|
35
35
|
begin
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -16,6 +16,7 @@
|
|
16
16
|
|
17
17
|
require "cgi/util"
|
18
18
|
require "rexml/parsers/sax2parser"
|
19
|
+
require "rexml/rexml"
|
19
20
|
require "rexml/sax2listener"
|
20
21
|
|
21
22
|
begin
|
@@ -156,12 +157,22 @@ module ChupaText
|
|
156
157
|
@listener.end_element(*args)
|
157
158
|
end
|
158
159
|
|
159
|
-
|
160
|
-
|
161
|
-
|
160
|
+
if (REXML::VERSION <=> "3.3.2") >= 0
|
161
|
+
def characters(text)
|
162
|
+
@listener.characters(text)
|
163
|
+
end
|
164
|
+
|
165
|
+
def cdata(content)
|
166
|
+
@listener.cdata(content)
|
167
|
+
end
|
168
|
+
else
|
169
|
+
def characters(text)
|
170
|
+
@listener.characters(CGI.unescapeHTML(text))
|
171
|
+
end
|
162
172
|
|
163
|
-
|
164
|
-
|
173
|
+
def cdata(content)
|
174
|
+
@listener.cdata(CGI.unescapeHTML(content))
|
175
|
+
end
|
165
176
|
end
|
166
177
|
end
|
167
178
|
end
|
data/lib/chupa-text/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2013-
|
1
|
+
# Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -15,5 +15,5 @@
|
|
15
15
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
16
16
|
|
17
17
|
module ChupaText
|
18
|
-
VERSION = "1.3.
|
18
|
+
VERSION = "1.3.6"
|
19
19
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
2
2
|
#
|
3
3
|
# This library is free software; you can redistribute it and/or
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
153
153
|
end
|
154
154
|
end
|
155
155
|
|
156
|
+
def test_complex_shared_strings
|
157
|
+
path = fixture_path("xlsx", "complex-shared-strings.xlsx")
|
158
|
+
actual = decompose(path).collect do |data|
|
159
|
+
[
|
160
|
+
data["index"],
|
161
|
+
data["name"],
|
162
|
+
data.body,
|
163
|
+
]
|
164
|
+
end
|
165
|
+
assert_equal([
|
166
|
+
[nil, nil, ""],
|
167
|
+
[
|
168
|
+
0,
|
169
|
+
"新規",
|
170
|
+
"No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
|
171
|
+
"1\t-\t45664\t45672\t対応中\n" +
|
172
|
+
"2\t-\t45664\t45672\t対応中\n" +
|
173
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
174
|
+
"4\t-\t45664\t45666\t対応中\n" +
|
175
|
+
"5\t-\t45664\t45666\t対応中\n" +
|
176
|
+
"6\t-\t45663\t45665\t承認待ち\n" +
|
177
|
+
"7\t-\t45660\t45665\t承認待ち\n" +
|
178
|
+
"8\t-\t45653\t45663\t承認待ち\n" +
|
179
|
+
"9\t-\t45653\t45663\t承認待ち\n" +
|
180
|
+
"10\tPSR2401770\t45652\t45666\t対応中\n",
|
181
|
+
],
|
182
|
+
[
|
183
|
+
1,
|
184
|
+
"全体",
|
185
|
+
"No\t案件番号\t開始日\t期日\tステータス\n" +
|
186
|
+
"1\tPSR2401564\t45617\t45726\t対応中\n" +
|
187
|
+
"2\tPSR2401194\t45553\t45716\t対応中\n" +
|
188
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
189
|
+
"4\t-\t45664\t45672\t対応中\n" +
|
190
|
+
"5\t-\t45664\t45672\t対応中\n" +
|
191
|
+
"6\t-\t45645\t45672\t対応中\n" +
|
192
|
+
"7\tPSR2401746\t45649\t45671\t対応中\n" +
|
193
|
+
"8\t-\t45640\t45667\t対応中\n" +
|
194
|
+
"9\t-\t45635\t45667\t対応中\n" +
|
195
|
+
"10\tPSR2401605\t45623\t45667\t対応中\n" +
|
196
|
+
"11\t-\t45664\t45666\t対応中\n" +
|
197
|
+
"12\t-\t45664\t45666\t対応中\n" +
|
198
|
+
"13\tPSR2401770\t45652\t45666\t対応中\n" +
|
199
|
+
"14\t-\t45645\t45665\t対応中\n" +
|
200
|
+
"15\tPSR2401609\t45624\t45666\t対応中\n",
|
201
|
+
],
|
202
|
+
[
|
203
|
+
2,
|
204
|
+
"案件",
|
205
|
+
"No\t案件番号\t開始日\t対応完了時期想定\n" +
|
206
|
+
"1\tPSR2401244\t45561.40347222222\t45744\n" +
|
207
|
+
"2\tPSR2401592\t45621.598611111112\t45698\n" +
|
208
|
+
"3\tPSR2401682\t45638.40902777778\t45688\n" +
|
209
|
+
"4\tPSR2401706\t45643.383333333331\t45671\n" +
|
210
|
+
"5\tPSR2401779\t45653.490277777775\t45671\n" +
|
211
|
+
"6\tPSR2401805\t45664.436805555553\t調整中\n" +
|
212
|
+
"7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
|
213
|
+
"8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
|
214
|
+
"9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
|
215
|
+
],
|
216
|
+
[
|
217
|
+
3,
|
218
|
+
"障害恒久対応・改善対応",
|
219
|
+
"No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
|
220
|
+
"1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
|
221
|
+
"2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
|
222
|
+
"3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
|
223
|
+
"4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
|
224
|
+
"5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
|
225
|
+
"6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
|
226
|
+
],
|
227
|
+
],
|
228
|
+
actual)
|
229
|
+
end
|
230
|
+
|
156
231
|
sub_test_case("invalid") do
|
157
232
|
def test_empty
|
158
233
|
messages = capture_log do
|
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: chupa-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sutou Kouhei
|
8
8
|
bindir: bin
|
9
9
|
cert_chain: []
|
10
|
-
date:
|
10
|
+
date: 2025-01-10 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: archive-zip
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- test/fixture/tar/top-level.tar
|
190
190
|
- test/fixture/tar/utf-8.tar
|
191
191
|
- test/fixture/xlsx/attributes.xlsx
|
192
|
+
- test/fixture/xlsx/complex-shared-strings.xlsx
|
192
193
|
- test/fixture/xlsx/empty.xlsx
|
193
194
|
- test/fixture/xlsx/multi-sheets.xlsx
|
194
195
|
- test/fixture/xlsx/not-shared-cell.xlsx
|
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
232
233
|
- !ruby/object:Gem::Version
|
233
234
|
version: '0'
|
234
235
|
requirements: []
|
235
|
-
rubygems_version: 3.6.
|
236
|
+
rubygems_version: 3.6.2
|
236
237
|
specification_version: 4
|
237
238
|
summary: ChupaText is an extensible text extractor. You can plug your custom text
|
238
239
|
extractor in ChupaText. You can write your plugin by Ruby.
|