simple_text_extract 3.0.2 → 3.0.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 388ec404e856a47441f3bb3e23873d70eff4fabf86d96afcfeb6f7ad324e262d
4
- data.tar.gz: 054f2cdb252b91ae14bdc678af1b75d433bfc6266d68ccace27ce4ec829c6484
3
+ metadata.gz: 1bcf42e8ea86d7b9bb1f14731cbf2219e5d2a3c9bca5288e0441776b9822a835
4
+ data.tar.gz: 3d17a5ec43537ea50c64b8b07237cd21125794084fa964a121267895b5ba5023
5
5
  SHA512:
6
- metadata.gz: 5c5a9d4f02bd00f32495803f60b2552d084f204a39df5c239d29e9be2c04013ae44b865c805c750c3a52a51a5e9b93a358e97d386a351aa16460664e048bac61
7
- data.tar.gz: f7bc4192659abaf6653f58c01b446e11e483dddad2efb91ebf077a88767346f50997d87ba8af5d4ab26e7ac2ed7bf6a4393fecf75b78b1968662a266e251a84e
6
+ metadata.gz: 7f452ddf56c6d464f4d16de10e4c934bd97b78e6adbedb43b163a5511881b81d466f3064e963410957b194420f74c0c6b554356b411f03845b37c417c845244f
7
+ data.tar.gz: be9d14077b31539fe14aec768cea5669958c922f0360eb8125e9289a8df61a36a6a2574b7ea1458eed010cf474ef80d70e7a601083016a0d968e37417c314c30
data/- ADDED
@@ -0,0 +1 @@
1
+ Test
data/CHANGELOG.md CHANGED
@@ -1,4 +1,8 @@
1
- ## 3.0.1 (2023-04-17)
1
+ ## 3.0.3 (2023-04-27)
2
+
3
+ - Add support for extracting text from tables in DOCX files.
4
+
5
+ ## 3.0.2 (2023-04-17)
2
6
 
3
7
  - Coerces filename in `SimpleTextExtract.supports?(filename:)` to string.
4
8
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.2)
4
+ simple_text_extract (3.0.4)
5
5
  roo (~> 2.10.0)
6
6
  rubyzip (~> 2.3.2)
7
7
  spreadsheet (~> 1.3.0)
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- class SimpleTextExtract::Extract
3
+ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
4
  def self.formatter(path)
5
5
  case path
6
6
  when /.zip$/i
@@ -28,7 +28,7 @@ class SimpleTextExtract::Extract
28
28
  end
29
29
 
30
30
  def to_s
31
- @to_s ||= extract.to_s.gsub(/[^\S\n]+/, " ").gsub(/\s?\n\s+/, "\n").strip
31
+ @to_s ||= extract.to_s.scrub.gsub(/[^\S\n]+/, " ").gsub(/\s?\n\s+/, "\n").strip
32
32
  end
33
33
 
34
34
  private
@@ -133,6 +133,13 @@ class SimpleTextExtract::Extract
133
133
  doc.xpath("//w:document//w:body/w:p").each do |node|
134
134
  result << node.text
135
135
  end
136
+
137
+ doc.xpath("//w:document//w:body//w:tbl").each do |node|
138
+ node.xpath(".//w:tr").each do |row|
139
+ text = row.xpath("w:tc").map(&:text)
140
+ result << text.join(", ")
141
+ end
142
+ end
136
143
  end
137
144
 
138
145
  result.join("\n")
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.2"
4
+ VERSION = "3.0.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.2
4
+ version: 3.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-04-18 00:00:00.000000000 Z
11
+ date: 2023-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -59,6 +59,7 @@ executables: []
59
59
  extensions: []
60
60
  extra_rdoc_files: []
61
61
  files:
62
+ - "-"
62
63
  - ".github/workflows/build.yml"
63
64
  - ".gitignore"
64
65
  - ".rubocop.yml"