simple_text_extract 3.0.7 → 3.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bad3e0fab883d324b24e9a06cacdceed53de65e51489be88557a3dcdc5bf39b6
4
- data.tar.gz: 2c8b53df878e404f90ca65eacae7ce25bc1663de7f138bed7c8ee744b13ee456
3
+ metadata.gz: b19c5d7467780a0ccb69fb18e29eb81c9acb7e16caf39a93ddeedaf325d541bd
4
+ data.tar.gz: 942c5a13853805d7e66feacf997e5a9181e69ea309a880916ca08653c0e3e8bd
5
5
  SHA512:
6
- metadata.gz: a2e1d6659ddfd9cf3252afc481db1077499ecfab0b4602b52555d6f5f18a29b7a7bbeb671009a0505be547a5ad553accbebf037df3f5a4aec1f9673fc4ff0069
7
- data.tar.gz: c3363629bf98d6fd55ac380830c2b7721903361a021e0ee10c5ddf33566be27199dda7e569cdc6821110a2200d3804d62f85d936a99165b58584df52c955afe1
6
+ metadata.gz: 42303612565dbfe9f23f699c3e554fb9509bbf7165e24d5b4b1fbea3f0ffe0bcd9f8723ac39861341f978b734ab671d17f6aa5a530e6fcf47dddfbe6e796acd1
7
+ data.tar.gz: c07deca30b24a70fef2baf7967bcfb411d4a60bdd24c624ce0a328e83be4ed558dca2628f468fa27b9d66e5e317413539d15d7b1bb551a8ec987d80eb69723a6
data/.rubocop.yml CHANGED
@@ -111,5 +111,8 @@ Lint/MissingSuper:
111
111
  Lint/ConstantDefinitionInBlock:
112
112
  Enabled: false
113
113
 
114
+ Lint/SuppressedException:
115
+ Enabled: false
116
+
114
117
  Style/SingleArgumentDig:
115
118
  Enabled: false
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.7)
4
+ simple_text_extract (3.0.9)
5
+ base64
6
+ csv
5
7
  roo (~> 2.10.0)
6
8
  rubyzip (~> 2.3.2)
7
9
  spreadsheet (~> 1.3.0)
@@ -10,61 +12,62 @@ GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
14
  ast (2.4.2)
13
- bigdecimal (3.1.8)
15
+ base64 (0.2.0)
16
+ bigdecimal (3.1.9)
14
17
  coderay (1.1.3)
15
- json (2.7.2)
18
+ csv (3.3.2)
19
+ json (2.9.1)
16
20
  language_server-protocol (3.17.0.3)
17
- memory_profiler (1.0.2)
21
+ memory_profiler (1.1.0)
18
22
  method_source (1.1.0)
19
- minitest (5.24.1)
20
- mocha (2.4.0)
23
+ minitest (5.25.4)
24
+ mocha (2.7.1)
21
25
  ruby2_keywords (>= 0.0.5)
22
- nokogiri (1.16.6-arm64-darwin)
26
+ nokogiri (1.18.1-arm64-darwin)
23
27
  racc (~> 1.4)
24
- nokogiri (1.16.6-x86_64-linux)
28
+ nokogiri (1.18.1-x86_64-linux-gnu)
25
29
  racc (~> 1.4)
26
- parallel (1.25.1)
27
- parser (3.3.4.0)
30
+ parallel (1.26.3)
31
+ parser (3.3.6.0)
28
32
  ast (~> 2.4.1)
29
33
  racc
30
- pry (0.14.2)
34
+ pry (0.15.2)
31
35
  coderay (~> 1.1)
32
36
  method_source (~> 1.0)
33
- racc (1.8.0)
37
+ racc (1.8.1)
34
38
  rainbow (3.1.1)
35
39
  rake (13.2.1)
36
- regexp_parser (2.9.2)
37
- rexml (3.3.1)
38
- strscan
40
+ regexp_parser (2.10.0)
39
41
  roo (2.10.1)
40
42
  nokogiri (~> 1)
41
43
  rubyzip (>= 1.3.0, < 3.0.0)
42
- rubocop (1.65.0)
44
+ rubocop (1.69.2)
43
45
  json (~> 2.3)
44
46
  language_server-protocol (>= 3.17.0)
45
47
  parallel (~> 1.10)
46
48
  parser (>= 3.3.0.2)
47
49
  rainbow (>= 2.2.2, < 4.0)
48
- regexp_parser (>= 2.4, < 3.0)
49
- rexml (>= 3.2.5, < 4.0)
50
- rubocop-ast (>= 1.31.1, < 2.0)
50
+ regexp_parser (>= 2.9.3, < 3.0)
51
+ rubocop-ast (>= 1.36.2, < 2.0)
51
52
  ruby-progressbar (~> 1.7)
52
- unicode-display_width (>= 2.4.0, < 3.0)
53
- rubocop-ast (1.31.3)
53
+ unicode-display_width (>= 2.4.0, < 4.0)
54
+ rubocop-ast (1.37.0)
54
55
  parser (>= 3.3.1.0)
55
56
  ruby-ole (1.2.13.1)
56
57
  ruby-progressbar (1.13.0)
57
58
  ruby2_keywords (0.0.5)
58
59
  rubyzip (2.3.2)
59
- spreadsheet (1.3.1)
60
+ spreadsheet (1.3.3)
60
61
  bigdecimal
61
62
  ruby-ole
62
- strscan (3.1.0)
63
- unicode-display_width (2.5.0)
63
+ unicode-display_width (3.1.3)
64
+ unicode-emoji (~> 4.0, >= 4.0.4)
65
+ unicode-emoji (4.0.4)
64
66
 
65
67
  PLATFORMS
66
68
  arm64-darwin-21
67
69
  arm64-darwin-23
70
+ arm64-darwin-24
68
71
  x86_64-linux
69
72
 
70
73
  DEPENDENCIES
data/lib/simple_text_extract/extract.rb CHANGED
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "open3"
4
+
3
5
  class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
6
  def self.formatter(path)
5
7
  case path
@@ -78,7 +80,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
78
80
  def pdf_extract
79
81
  return nil if SimpleTextExtract.missing_dependency?("pdftotext")
80
82
 
81
- `pdftotext #{Shellwords.escape(file.path)} - 2>/dev/null`
83
+ stdout, stderr, status = Open3.capture3("pdftotext #{Shellwords.escape(file.path)} -")
84
+ if status.success?
85
+ stdout
86
+ else
87
+ warn "pdftotext failed: #{stderr}"
88
+ nil
89
+ end
82
90
  end
83
91
 
84
92
  def xlsx_extract
@@ -87,13 +95,14 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
87
95
  spreadsheet = Roo::Spreadsheet.open(file, only_visible_sheets: true)
88
96
 
89
97
  text = []
90
-
91
98
  spreadsheet.sheets.each_with_index do |name, i|
92
99
  text << "# Sheet Index: #{i}"
93
100
  text << "# Sheet Name: #{name}"
94
101
 
95
102
  spreadsheet.sheet(name)&.each_row_streaming do |row|
96
- text << row.map(&:to_s).join(" ")
103
+ text << row.map do |cell|
104
+ cell.value.to_s
105
+ end.join(" ")
97
106
  end
98
107
  end
99
108
 
@@ -116,7 +125,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
116
125
  def doc_extract
117
126
  return nil if SimpleTextExtract.missing_dependency?("antiword")
118
127
 
119
- `antiword #{Shellwords.escape(file.path)}`
128
+ stdout, stderr, status = Open3.capture3("antiword #{Shellwords.escape(file.path)}")
129
+ if status.success?
130
+ stdout
131
+ else
132
+ warn "antiword failed: #{stderr}"
133
+ nil
134
+ end
120
135
  end
121
136
 
122
137
  def docx_extract
data/lib/simple_text_extract/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.7"
4
+ VERSION = "3.0.9"
5
5
  end
data/simple_text_extract.gemspec CHANGED
@@ -27,6 +27,8 @@ Gem::Specification.new do |spec|
27
27
  spec.requirements << "pdftotext/poppler"
28
28
 
29
29
  spec.add_dependency "roo", "~> 2.10.0"
30
- spec.add_dependency "spreadsheet", "~> 1.3.0"
31
30
  spec.add_dependency "rubyzip", "~> 2.3.2"
31
+ spec.add_dependency "spreadsheet", "~> 1.3.0"
32
+ spec.add_dependency "base64"
33
+ spec.add_dependency "csv"
32
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-12 00:00:00.000000000 Z
11
+ date: 2024-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 2.10.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubyzip
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 2.3.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 2.3.2
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: spreadsheet
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,19 +53,33 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: 1.3.0
41
55
  - !ruby/object:Gem::Dependency
42
- name: rubyzip
56
+ name: base64
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - "~>"
59
+ - - ">="
46
60
  - !ruby/object:Gem::Version
47
- version: 2.3.2
61
+ version: '0'
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - "~>"
66
+ - - ">="
53
67
  - !ruby/object:Gem::Version
54
- version: 2.3.2
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: csv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: Extract text from various file types before resorting to an OCR solution.
56
84
  email:
57
85
  - nickweiland@gmail.com
@@ -97,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
125
  requirements:
98
126
  - antiword
99
127
  - pdftotext/poppler
100
- rubygems_version: 3.5.15
128
+ rubygems_version: 3.5.21
101
129
  signing_key:
102
130
  specification_version: 4
103
131
  summary: Extract text from various file types before resorting to an OCR solution.