act_as_page_extractor 0.6.1 → 0.6.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0956a6daefbfb71e23d34ba59b5010fee87805166d419725c8d95e116ec54ef7'
4
- data.tar.gz: '096d66ce3473ab91068d35a3b5c00de975ce186093735e59592fdf9863b36aee'
3
+ metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
4
+ data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
5
5
  SHA512:
6
- metadata.gz: d4c48c1bbcdac244047230e7144504d38b453b4d05f72093dec848b733950522b0498c013be62796ef4cba0af005399aa2d6987cd6ff1c2e9fc249758f8b45e7
7
- data.tar.gz: 136917cec986faeb2ad866bc63bad36a96cdd76e7d129b146c79589a67f9dee69a0f6521c4c958024e54ffcbc441332b2ec89b7d986376becddbefbbebd8efb2
6
+ metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
7
+ data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
data/README.md CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
7
7
 
8
8
  Install appropriate tools before using:
9
9
 
10
- sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
11
-
10
+ ```sh
11
+ sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
12
+ ```
12
13
  Add this line to your application's Gemfile:
13
14
 
14
- gem 'act_as_page_extractor'
15
-
15
+ ```rb
16
+ gem 'act_as_page_extractor'
17
+ bundle
18
+ ```
16
19
  ## Usage
17
20
 
18
- For example, for model Document we need execute:
21
+ For example, for the Document model in the Rails framework we need to run:
19
22
 
20
- $ bundle
21
- $ rails g act_as_page_extractor:migration Document category_id user_id
23
+ ```sh
24
+ rails g act_as_page_extractor:migration Document category_id user_id
25
+ ```
22
26
 
23
27
  As a result we get two migration files:
24
-
25
- class AddPageExtractorFields < ActiveRecord::Migration
26
- def change
27
- add_column :documents, :page_extraction_state, :string, default: ''
28
- add_column :documents, :page_extraction_pages, :integer, default: 0
29
- add_column :documents, :page_extraction_doctype, :string, default: ''
30
- add_column :documents, :page_extraction_filesize, :string, default: ''
31
- end
32
- end
33
-
34
- class CreateExtractedPages < ActiveRecord::Migration
35
- def change
36
- create_table :extracted_pages do |t|
37
- t.text :page
38
- t.integer :document_id
39
- t.integer :category_id
40
- t.integer :user_id
41
- t.integer :page_number
42
-
43
- t.timestamps null: false
44
- end
45
-
46
- add_index :extracted_pages, :document_id
47
- add_index :extracted_pages, :category_id
48
- add_index :extracted_pages, [:document_id, :category_id]
49
- add_index :extracted_pages, [:document_id, :page_number]
50
- end
28
+ ```rb
29
+ class AddPageExtractorFields < ActiveRecord::Migration
30
+ def change
31
+ add_column :documents, :page_extraction_state, :string, default: ''
32
+ add_column :documents, :page_extraction_pages, :integer, default: 0
33
+ add_column :documents, :page_extraction_doctype, :string, default: ''
34
+ add_column :documents, :page_extraction_filesize, :string, default: ''
35
+ end
36
+ end
37
+
38
+ class CreateExtractedPages < ActiveRecord::Migration
39
+ def change
40
+ create_table :extracted_pages do |t|
41
+ t.text :page
42
+ t.integer :document_id
43
+ t.integer :category_id
44
+ t.integer :user_id
45
+ t.integer :page_number
46
+
47
+ t.timestamps null: false
51
48
  end
52
49
 
50
+ add_index :extracted_pages, :document_id
51
+ add_index :extracted_pages, :category_id
52
+ add_index :extracted_pages, [:document_id, :category_id]
53
+ add_index :extracted_pages, [:document_id, :page_number]
54
+ end
55
+ end
56
+ ```
53
57
 
54
- Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://www.exoplatform.com/docs/public/index.jsp?topic=%2FPLF43%2FPLFAdminGuide.Configuration.JODConverter.html))
58
+ The Document model must have a field that contains the path to a file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
55
59
 
56
60
  Add the following parameters to the model for initialization:
57
61
 
58
- class Document < ActiveRecord::Base
59
- include ActAsPageExtractor
62
+ ```rb
63
+ class Document < ActiveRecord::Base
64
+ include ActAsPageExtractor
60
65
 
61
- act_as_page_extractor options: {
62
- document_class: 'Document',
63
- save_as_pdf: true,
64
- filename: :filename,
65
- document_id: :document_id,
66
- additional_fields: [:category_id, :user_id],
67
- #file_storage: "/full/path/to/documents/storage",
68
- #pdf_storage: "/full/path/to/extracted/pdf/storage"
69
- }
66
+ act_as_page_extractor options: {
67
+ document_class: 'Document',
68
+ save_as_pdf: true,
69
+ filename: :filename,
70
+ document_id: :document_id,
71
+ additional_fields: [:category_id, :user_id],
72
+ #file_storage: "/full/path/to/documents/storage",
73
+ #pdf_storage: "/full/path/to/extracted/pdf/storage"
74
+ }
70
75
 
71
- has_many :extracted_pages, dependent: :destroy
72
- end
76
+ has_many :extracted_pages, dependent: :destroy
77
+ end
78
+ ```
73
79
 
74
80
  Now our instance has a few new methods:
75
81
 
76
- document = Document.first
77
- document.page_extract!
78
- document.extracted_pages
79
- document.pdf_path # if option 'save_as_pdf' is 'true'
82
+ ```rb
83
+ document = Document.first
84
+ document.page_extract!
85
+ document.extracted_pages
86
+ document.pdf_path # if option 'save_as_pdf' is 'true'
80
87
 
81
- # Access to pages
82
- ExtractedPage.count
88
+ # Access to pages
89
+ ExtractedPage.count
83
90
 
84
- # Importing whole directory of documents
85
- ActAsPageExtractor.import_files('/path/to/foler/with/documents')
91
+ # Importing whole directory of documents
92
+ ActAsPageExtractor.import_files('/path/to/foler/with/documents')
86
93
 
87
- # We can use cron for run the processing of all the new documents
88
- ActAsPageExtractor.start_extraction
94
+ # We can use cron to run the processing of all the new documents
95
+ ActAsPageExtractor.start_extraction
89
96
 
90
- # Getting statistics information of all documents
91
- ActAsPageExtractor.statistics
97
+ # Getting statistics information of all documents
98
+ ActAsPageExtractor.statistics
99
+ ```
92
100
 
93
- Parameters of initializing `act_as_page_extractor options: { ... }`:
101
+ Initialization parameters for **act_as_page_extractor**:
94
102
 
95
- `document_class` - name of model (e.g. 'Document)
96
- `save_as_pdf` - boolean [true, false] when we want save temporary pdf
97
- `filename` - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
98
- `document_id` - name for saving id
99
- `additional_fields` - additional fields that added to extracted page (e.g. for indexing, etc.)
100
- `file_storage` - path for saving tmp files (by default it is "public")
101
- `pdf_storage` - path for saving pdf (by default it is "public/uploads/extracted/pdf")
103
+ * **document_class** - name of model (e.g. Document)
104
+ * **save_as_pdf** - boolean [true, false]; set to true when we want to save the temporary pdf
105
+ * **filename** - name of the field that gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. a CarrierWave object with 'filename.url')
106
+ * **document_id** - name for saving id
107
+ * **additional_fields** - additional fields that are added to each extracted page (e.g. for indexing, etc.)
108
+ * **file_storage** - path for saving tmp files (by default it is "public")
109
+ * **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
102
110
 
103
111
  ## Run tests
104
- $ COVERAGE=true rspec
105
-
112
+ ```sh
113
+ rspec
114
+ ```
106
115
  ## Contributing
107
116
  1. Fork it
108
117
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
115
124
  phlowerteam@gmail.com
116
125
 
117
126
  ## License
118
- Copyright (c) 2017 PhlowerTeam
127
+ Copyright (c) 2024 PhlowerTeam
119
128
  MIT License
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['phlowerteam@gmail.com']
11
11
  spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
12
12
  spec.summary = %q{Uses system calls}
13
- spec.homepage = 'https://github.com/phlowerteam'
13
+ spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
14
14
  spec.license = 'MIT'
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
@@ -1,17 +1,17 @@
1
1
  require 'timeout'
2
2
 
3
3
  module ActAsPageExtractor
4
+ # :nocov:
4
5
  def timeout_wrapper
5
6
  result = nil
6
7
  begin
7
8
  result = Timeout::timeout(60*5) { yield }
8
9
  rescue
9
- # :nocov:
10
10
  ensure
11
- # :nocov:
12
11
  result
13
12
  end
14
13
  end
14
+ # :nocov:
15
15
 
16
16
  def is_extracted
17
17
  @pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
@@ -46,11 +46,11 @@ module ActAsPageExtractor
46
46
  # ap "@copy_document_path"
47
47
  # ap @copy_document_path
48
48
  # ap "@document_path"
49
- ap @document_path
49
+ # ap @document_path
50
50
  # ap "@pdf_path"
51
51
  # ap @pdf_path
52
52
  # ap "@pdf_pages"
53
- ap @pdf_pages
53
+ # ap @pdf_pages
54
54
  end
55
55
  # :nocov:
56
56
  end
@@ -1,5 +1,5 @@
1
1
  # :nocov:
2
2
  module ActAsPageExtractor
3
- VERSION = "0.6.1"
3
+ VERSION = "0.6.2"
4
4
  end
5
5
  # :nocov:
data/spec/spec_helper.rb CHANGED
@@ -1,7 +1,11 @@
1
- if ENV['COVERAGE']
1
+ unless ENV['SKIP_COVERAGE']
2
2
  require 'simplecov'
3
- SimpleCov.start 'rails'
3
+ SimpleCov.start 'rails' do
4
+ add_filter 'vendor'
5
+ end
6
+ SimpleCov.minimum_coverage 100
4
7
  end
8
+
5
9
  require 'rspec'
6
10
  require 'support/models'
7
11
  require 'act_as_page_extractor'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: act_as_page_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - PhlowerTeam
@@ -249,7 +249,7 @@ files:
249
249
  - test/test-doc-3-pages.rtf
250
250
  - test/test-doc-3-pages.txt
251
251
  - test/test-doc-3-pages.wrong
252
- homepage: https://github.com/phlowerteam
252
+ homepage: https://github.com/phlowerteam/act_as_page_extractor
253
253
  licenses:
254
254
  - MIT
255
255
  metadata: {}