chupa-text 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d81031d80654a357700972d1443aea5994ba4e3f4a1684e670a19b7041b12073
4
- data.tar.gz: 17d66e6f6660066fd8c4fd83baf01eee8c2296a2a7a777ab7c0db3bbf9917d56
3
+ metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
4
+ data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
5
5
  SHA512:
6
- metadata.gz: e3ab55e7923da653b4c277491712e6564db633b664830ad2153b60b0ada7d09de69576074a0a3592406b03041f09990fa7a0daf0fd3f99d798543d22d5416a2c
7
- data.tar.gz: ab8e04142b49119e51c9d7da40776efc674e75034148670c7f921a06320896426f7aba99a42ff467a97d07bd89c776bc4e04ddfa788ce826d51481baf8890470
6
+ metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
7
+ data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -46,3 +46,13 @@ desc "Run tests"
46
46
  task :test do
47
47
  ruby("test/run-test.rb")
48
48
  end
49
+
50
+ release_task = Rake.application["release"]
51
+ # We use Trusted Publishing.
52
+ release_task.prerequisites.delete("build")
53
+ release_task.prerequisites.delete("release:rubygem_push")
54
+ release_task_comment = release_task.comment
55
+ if release_task_comment
56
+ release_task.clear_comments
57
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
58
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,23 @@
1
1
  # News
2
2
 
3
+ ## 1.3.6: 2025-01-10
4
+
5
+ ### Fixes
6
+
7
+ * xlsx: Fixed a bug that wrong text is extracted with complex shared
8
+ strings.
9
+ * Reported by Tomohisa Kusukawa.
10
+
11
+ ### Thanks
12
+
13
+ * Tomohisa Kusukawa
14
+
15
+ ## 1.3.5: 2024-09-22
16
+
17
+ ### Improvements
18
+
19
+ * Added support for REXML 3.3.2 or later.
20
+
3
21
  ## 1.3.4: 2024-09-22
4
22
 
5
23
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -36,7 +36,7 @@ module ChupaText
36
36
  end
37
37
 
38
38
  def decompose(data)
39
- text = ""
39
+ text = +""
40
40
  data.open do |input|
41
41
  begin
42
42
  csv = ::CSV.new(input, liberal_parsing: true)
@@ -68,7 +68,7 @@ module ChupaText
68
68
  width, height = data.expected_screenshot_size
69
69
  max_n_lines = 10
70
70
  font_size = height / max_n_lines
71
- target_text = ""
71
+ target_text = +""
72
72
  text.each_line.with_index do |line, i|
73
73
  break if i == max_n_lines
74
74
  target_text << line
@@ -41,7 +41,7 @@ module ChupaText
41
41
 
42
42
  private
43
43
  def start_decompose(context)
44
- context[:text] = ""
44
+ context[:text] = +""
45
45
  end
46
46
 
47
47
  def process_entry(entry, context)
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -56,7 +56,7 @@ module ChupaText
56
56
  case entry.zip_path
57
57
  when /\Appt\/slides\/slide(\d+)\.xml/
58
58
  nth_slide = Integer($1, 10)
59
- slide_text = ""
59
+ slide_text = +""
60
60
  extract_text(entry, slide_text)
61
61
  context[:slides] << [nth_slide, slide_text]
62
62
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019-2022 Sutou Kouhei <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -49,7 +49,7 @@ module ChupaText
49
49
  def process_entry(entry, context)
50
50
  case entry.zip_path
51
51
  when "xl/sharedStrings.xml"
52
- extract_text(entry, context[:shared_strings])
52
+ extract_shared_strings(entry, context[:shared_strings])
53
53
  when "xl/workbook.xml"
54
54
  listener = WorkbookListener.new(context[:sheet_names])
55
55
  parse(entry.file_data, listener)
@@ -73,7 +73,7 @@ module ChupaText
73
73
  sheets = context[:sheets].sort_by(&:first).collect(&:last)
74
74
  sheet_names = context[:sheet_names]
75
75
  sheets.each_with_index do |sheet, i|
76
- sheet_text = ""
76
+ sheet_text = +""
77
77
  sheet.each do |row|
78
78
  row_texts = row.collect do |cell|
79
79
  case cell
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -70,6 +70,11 @@ module ChupaText
70
70
  parse(entry.file_data, listener)
71
71
  end
72
72
 
73
+ def extract_shared_strings(entry, shared_strings)
74
+ listener = SharedStringsListener.new(shared_strings, @namespace_uri)
75
+ parse(entry.file_data, listener)
76
+ end
77
+
73
78
  def log_tag
74
79
  "[decomposer][office-open-xml]"
75
80
  end
@@ -90,26 +95,74 @@ module ChupaText
90
95
  end
91
96
 
92
97
  def end_element(uri, local_name, qname)
98
+ if uri == @target_uri
99
+ case local_name
100
+ when "p", "br"
101
+ @output << "\n"
102
+ when "t"
103
+ @in_target = false
104
+ end
105
+ end
106
+ end
107
+
108
+ def characters(text)
109
+ add_text(text)
110
+ end
111
+
112
+ def cdata(content)
113
+ add_text(content)
114
+ end
115
+
116
+ private
117
+ def add_text(text)
118
+ return unless @in_target
119
+ @output << text
120
+ end
121
+ end
122
+
123
+ class SharedStringsListener < SAXListener
124
+ def initialize(output, target_uri)
125
+ @output = output
126
+ @target_uri = target_uri
127
+ @tag_stack = []
93
128
  @in_target = false
129
+ @current_text = +""
130
+ end
131
+
132
+ def start_element(uri, local_name, qname, attributes)
133
+ @tag_stack << local_name
134
+
135
+ return unless uri == @target_uri
136
+ case local_name
137
+ when "t"
138
+ @in_target = true
139
+ @current_text = +""
140
+ end
141
+ end
94
142
 
143
+ def end_element(uri, local_name, qname)
95
144
  return unless uri == @target_uri
96
145
  case local_name
97
- when "p", "br"
98
- @output << "\n"
146
+ when "t"
147
+ add_text(@current_text)
148
+ @in_target = false
99
149
  end
150
+ ensure
151
+ @tag_stack.pop
100
152
  end
101
153
 
102
154
  def characters(text)
103
- add_text(text)
155
+ @current_text << text if @in_target
104
156
  end
105
157
 
106
158
  def cdata(content)
107
- add_text(content)
159
+ @current_text << content if @in_target
108
160
  end
109
161
 
110
162
  private
111
163
  def add_text(text)
112
- return unless @in_target
164
+ parent_tag = @tag_stack[-2]
165
+ return unless parent_tag == "si"
113
166
  @output << text
114
167
  end
115
168
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -72,7 +72,7 @@ module ChupaText
72
72
  when DRAW_URI
73
73
  case local_name
74
74
  when "page"
75
- @slides << {text: ""}
75
+ @slides << {text: +""}
76
76
  end
77
77
  end
78
78
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -96,9 +96,9 @@ module ChupaText
96
96
  when "table-row"
97
97
  @sheets.last[:rows] << []
98
98
  when "table-cell"
99
- @sheets.last[:rows].last << {text: ""}
99
+ @sheets.last[:rows].last << {text: +""}
100
100
  when "covered-table-cell"
101
- @sheets.last[:rows].last << {text: ""}
101
+ @sheets.last[:rows].last << {text: +""}
102
102
  when "shapes"
103
103
  @in_shapes = true
104
104
  end
@@ -116,7 +116,7 @@ module ChupaText
116
116
  case local_name
117
117
  when "table"
118
118
  sheet = @sheets.last
119
- text = ""
119
+ text = +""
120
120
  shape_texts = sheet[:shape_texts]
121
121
  unless shape_texts.empty?
122
122
  text << shape_texts.join("\n") << "\n"
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,7 @@ module ChupaText
29
29
 
30
30
  private
31
31
  def process_content(entry, context, &block)
32
- context[:text] = ""
32
+ context[:text] = +""
33
33
  listener = TextListener.new(context[:text])
34
34
  parse(entry.file_data, listener)
35
35
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,7 @@ module ChupaText
29
29
  end
30
30
 
31
31
  def decompose(data)
32
- text = ""
32
+ text = +""
33
33
  listener = Listener.new(text)
34
34
  data.open do |input|
35
35
  begin
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -16,6 +16,7 @@
16
16
 
17
17
  require "cgi/util"
18
18
  require "rexml/parsers/sax2parser"
19
+ require "rexml/rexml"
19
20
  require "rexml/sax2listener"
20
21
 
21
22
  begin
@@ -156,12 +157,22 @@ module ChupaText
156
157
  @listener.end_element(*args)
157
158
  end
158
159
 
159
- def characters(text)
160
- @listener.characters(CGI.unescapeHTML(text))
161
- end
160
+ if (REXML::VERSION <=> "3.3.2") >= 0
161
+ def characters(text)
162
+ @listener.characters(text)
163
+ end
164
+
165
+ def cdata(content)
166
+ @listener.cdata(content)
167
+ end
168
+ else
169
+ def characters(text)
170
+ @listener.characters(CGI.unescapeHTML(text))
171
+ end
162
172
 
163
- def cdata(content)
164
- @listener.cdata(CGI.unescapeHTML(content))
173
+ def cdata(content)
174
+ @listener.cdata(CGI.unescapeHTML(content))
175
+ end
165
176
  end
166
177
  end
167
178
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2013-2022 Sutou Kouhei <kou@clear-code.com>
1
+ # Copyright (C) 2013-2024 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.3.4"
18
+ VERSION = "1.3.6"
19
19
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
153
153
  end
154
154
  end
155
155
 
156
+ def test_complex_shared_strings
157
+ path = fixture_path("xlsx", "complex-shared-strings.xlsx")
158
+ actual = decompose(path).collect do |data|
159
+ [
160
+ data["index"],
161
+ data["name"],
162
+ data.body,
163
+ ]
164
+ end
165
+ assert_equal([
166
+ [nil, nil, ""],
167
+ [
168
+ 0,
169
+ "新規",
170
+ "No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
171
+ "1\t-\t45664\t45672\t対応中\n" +
172
+ "2\t-\t45664\t45672\t対応中\n" +
173
+ "3\t-\t45664\t45672\t対応中\n" +
174
+ "4\t-\t45664\t45666\t対応中\n" +
175
+ "5\t-\t45664\t45666\t対応中\n" +
176
+ "6\t-\t45663\t45665\t承認待ち\n" +
177
+ "7\t-\t45660\t45665\t承認待ち\n" +
178
+ "8\t-\t45653\t45663\t承認待ち\n" +
179
+ "9\t-\t45653\t45663\t承認待ち\n" +
180
+ "10\tPSR2401770\t45652\t45666\t対応中\n",
181
+ ],
182
+ [
183
+ 1,
184
+ "全体",
185
+ "No\t案件番号\t開始日\t期日\tステータス\n" +
186
+ "1\tPSR2401564\t45617\t45726\t対応中\n" +
187
+ "2\tPSR2401194\t45553\t45716\t対応中\n" +
188
+ "3\t-\t45664\t45672\t対応中\n" +
189
+ "4\t-\t45664\t45672\t対応中\n" +
190
+ "5\t-\t45664\t45672\t対応中\n" +
191
+ "6\t-\t45645\t45672\t対応中\n" +
192
+ "7\tPSR2401746\t45649\t45671\t対応中\n" +
193
+ "8\t-\t45640\t45667\t対応中\n" +
194
+ "9\t-\t45635\t45667\t対応中\n" +
195
+ "10\tPSR2401605\t45623\t45667\t対応中\n" +
196
+ "11\t-\t45664\t45666\t対応中\n" +
197
+ "12\t-\t45664\t45666\t対応中\n" +
198
+ "13\tPSR2401770\t45652\t45666\t対応中\n" +
199
+ "14\t-\t45645\t45665\t対応中\n" +
200
+ "15\tPSR2401609\t45624\t45666\t対応中\n",
201
+ ],
202
+ [
203
+ 2,
204
+ "案件",
205
+ "No\t案件番号\t開始日\t対応完了時期想定\n" +
206
+ "1\tPSR2401244\t45561.40347222222\t45744\n" +
207
+ "2\tPSR2401592\t45621.598611111112\t45698\n" +
208
+ "3\tPSR2401682\t45638.40902777778\t45688\n" +
209
+ "4\tPSR2401706\t45643.383333333331\t45671\n" +
210
+ "5\tPSR2401779\t45653.490277777775\t45671\n" +
211
+ "6\tPSR2401805\t45664.436805555553\t調整中\n" +
212
+ "7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
213
+ "8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
214
+ "9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
215
+ ],
216
+ [
217
+ 3,
218
+ "障害恒久対応・改善対応",
219
+ "No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
220
+ "1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
221
+ "2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
222
+ "3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
223
+ "4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
224
+ "5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
225
+ "6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
226
+ ],
227
+ ],
228
+ actual)
229
+ end
230
+
156
231
  sub_test_case("invalid") do
157
232
  def test_empty
158
233
  messages = capture_log do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sutou Kouhei
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2024-09-22 00:00:00.000000000 Z
10
+ date: 2025-01-10 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: archive-zip
@@ -189,6 +189,7 @@ files:
189
189
  - test/fixture/tar/top-level.tar
190
190
  - test/fixture/tar/utf-8.tar
191
191
  - test/fixture/xlsx/attributes.xlsx
192
+ - test/fixture/xlsx/complex-shared-strings.xlsx
192
193
  - test/fixture/xlsx/empty.xlsx
193
194
  - test/fixture/xlsx/multi-sheets.xlsx
194
195
  - test/fixture/xlsx/not-shared-cell.xlsx
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
232
233
  - !ruby/object:Gem::Version
233
234
  version: '0'
234
235
  requirements: []
235
- rubygems_version: 3.6.0.dev
236
+ rubygems_version: 3.6.2
236
237
  specification_version: 4
237
238
  summary: ChupaText is an extensible text extractor. You can plug your custom text
238
239
  extractor in ChupaText. You can write your plugin by Ruby.