act_as_page_extractor 0.6.1 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +80 -71
- data/act_as_page_extractor.gemspec +1 -1
- data/lib/act_as_page_extractor/modules/tools.rb +4 -4
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/spec/spec_helper.rb +6 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
|
4
|
+
data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
|
7
|
+
data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
|
data/README.md
CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
|
|
7
7
|
|
8
8
|
Install appropriate tools before using:
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
```sh
|
11
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
12
|
+
```
|
12
13
|
Add this line to your application's Gemfile:
|
13
14
|
|
14
|
-
|
15
|
-
|
15
|
+
```rb
|
16
|
+
gem 'act_as_page_extractor'
|
17
|
+
bundle
|
18
|
+
```
|
16
19
|
## Usage
|
17
20
|
|
18
|
-
For example, for model Document we need
|
21
|
+
For example, for the Document model in the Rails framework we need to run:
|
19
22
|
|
20
|
-
|
21
|
-
|
23
|
+
```sh
|
24
|
+
rails g act_as_page_extractor:migration Document category_id user_id
|
25
|
+
```
|
22
26
|
|
23
27
|
As a result we get two migration files:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
add_index :extracted_pages, :document_id
|
47
|
-
add_index :extracted_pages, :category_id
|
48
|
-
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
-
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
-
end
|
28
|
+
```rb
|
29
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
30
|
+
def change
|
31
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
32
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
39
|
+
def change
|
40
|
+
create_table :extracted_pages do |t|
|
41
|
+
t.text :page
|
42
|
+
t.integer :document_id
|
43
|
+
t.integer :category_id
|
44
|
+
t.integer :user_id
|
45
|
+
t.integer :page_number
|
46
|
+
|
47
|
+
t.timestamps null: false
|
51
48
|
end
|
52
49
|
|
50
|
+
add_index :extracted_pages, :document_id
|
51
|
+
add_index :extracted_pages, :category_id
|
52
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
53
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
|
-
Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://
|
58
|
+
The Document model must have a field which contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
|
55
59
|
|
56
60
|
Add the following initialization parameters to the model:
|
57
61
|
|
58
|
-
|
59
|
-
|
62
|
+
```rb
|
63
|
+
class Document < ActiveRecord::Base
|
64
|
+
include ActAsPageExtractor
|
60
65
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
act_as_page_extractor options: {
|
67
|
+
document_class: 'Document',
|
68
|
+
save_as_pdf: true,
|
69
|
+
filename: :filename,
|
70
|
+
document_id: :document_id,
|
71
|
+
additional_fields: [:category_id, :user_id],
|
72
|
+
#file_storage: "/full/path/to/documents/storage",
|
73
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
74
|
+
}
|
70
75
|
|
71
|
-
|
72
|
-
|
76
|
+
has_many :extracted_pages, dependent: :destroy
|
77
|
+
end
|
78
|
+
```
|
73
79
|
|
74
80
|
Now our instance has a few new methods:
|
75
81
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
82
|
+
```rb
|
83
|
+
document = Document.first
|
84
|
+
document.page_extract!
|
85
|
+
document.extracted_pages
|
86
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
87
|
|
81
|
-
|
82
|
-
|
88
|
+
# Access to pages
|
89
|
+
ExtractedPage.count
|
83
90
|
|
84
|
-
|
85
|
-
|
91
|
+
# Importing whole directory of documents
|
92
|
+
ActAsPageExtractor.import_files('/path/to/folder/with/documents')
|
86
93
|
|
87
|
-
|
88
|
-
|
94
|
+
# We can use cron to run the processing of all the new documents
|
95
|
+
ActAsPageExtractor.start_extraction
|
89
96
|
|
90
|
-
|
91
|
-
|
97
|
+
# Getting statistics information of all documents
|
98
|
+
ActAsPageExtractor.statistics
|
99
|
+
```
|
92
100
|
|
93
|
-
Parameters of initializing
|
101
|
+
Parameters of initializing **act_as_page_extractor**:
|
94
102
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
103
|
+
* **document_class** - name of model (e.g. Document)
|
104
|
+
* **save_as_pdf** - boolean [true, false], set to true when we want to save the temporary pdf
|
105
|
+
* **filename** - name of the field which gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. CarrierWave object with 'filename.url')
|
106
|
+
* **document_id** - name for saving id
|
107
|
+
* **additional_fields** - additional fields that are added to the extracted page (e.g. for indexing, etc.)
|
108
|
+
* **file_storage** - path for saving tmp files (by default it is "public")
|
109
|
+
* **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
110
|
|
103
111
|
## Run tests
|
104
|
-
|
105
|
-
|
112
|
+
```sh
|
113
|
+
rspec
|
114
|
+
```
|
106
115
|
## Contributing
|
107
116
|
1. Fork it
|
108
117
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
|
|
115
124
|
phlowerteam@gmail.com
|
116
125
|
|
117
126
|
## License
|
118
|
-
Copyright (c)
|
127
|
+
Copyright (c) 2024 PhlowerTeam
|
119
128
|
MIT License
|
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['phlowerteam@gmail.com']
|
11
11
|
spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
|
12
12
|
spec.summary = %q{Uses system calls}
|
13
|
-
spec.homepage = 'https://github.com/phlowerteam'
|
13
|
+
spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
@@ -1,17 +1,17 @@
|
|
1
1
|
require 'timeout'
|
2
2
|
|
3
3
|
module ActAsPageExtractor
|
4
|
+
# :nocov:
|
4
5
|
def timeout_wrapper
|
5
6
|
result = nil
|
6
7
|
begin
|
7
8
|
result = Timeout::timeout(60*5) { yield }
|
8
9
|
rescue
|
9
|
-
# :nocov:
|
10
10
|
ensure
|
11
|
-
# :nocov:
|
12
11
|
result
|
13
12
|
end
|
14
13
|
end
|
14
|
+
# :nocov:
|
15
15
|
|
16
16
|
def is_extracted
|
17
17
|
@pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
|
@@ -46,11 +46,11 @@ module ActAsPageExtractor
|
|
46
46
|
# ap "@copy_document_path"
|
47
47
|
# ap @copy_document_path
|
48
48
|
# ap "@document_path"
|
49
|
-
|
49
|
+
# ap @document_path
|
50
50
|
# ap "@pdf_path"
|
51
51
|
# ap @pdf_path
|
52
52
|
# ap "@pdf_pages"
|
53
|
-
|
53
|
+
# ap @pdf_pages
|
54
54
|
end
|
55
55
|
# :nocov:
|
56
56
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
|
1
|
+
unless ENV['SKIP_COVERAGE']
|
2
2
|
require 'simplecov'
|
3
|
-
SimpleCov.start 'rails'
|
3
|
+
SimpleCov.start 'rails' do
|
4
|
+
add_filter 'vendor'
|
5
|
+
end
|
6
|
+
SimpleCov.minimum_coverage 100
|
4
7
|
end
|
8
|
+
|
5
9
|
require 'rspec'
|
6
10
|
require 'support/models'
|
7
11
|
require 'act_as_page_extractor'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: act_as_page_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- PhlowerTeam
|
@@ -249,7 +249,7 @@ files:
|
|
249
249
|
- test/test-doc-3-pages.rtf
|
250
250
|
- test/test-doc-3-pages.txt
|
251
251
|
- test/test-doc-3-pages.wrong
|
252
|
-
homepage: https://github.com/phlowerteam
|
252
|
+
homepage: https://github.com/phlowerteam/act_as_page_extractor
|
253
253
|
licenses:
|
254
254
|
- MIT
|
255
255
|
metadata: {}
|