artext 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 379fd2eab84219f4a82393817354930f7e6cbacd
4
- data.tar.gz: cee44b5160c701f51c1836d6cd0a27d655ecf4bb
2
+ SHA256:
3
+ metadata.gz: 2b5fad951364d5b4d72143b5306bc59138b8ecbc120f34acd826aa7a64694e7c
4
+ data.tar.gz: 71c65f3cb3d0a7451dab3703050cd788454bdfba65981a8e633dc5f62808b059
5
5
  SHA512:
6
- metadata.gz: e505ce5c26c90ecd95ffee18656626954276fc5333e607c158b650f2ded704a7b69a2181904555ae106f25e252488ca878b3aa58bccbd7fda21145c013290ac2
7
- data.tar.gz: a6ffbd0ebe58681e9ecb9e0554e3aa0fc9ff446fe082ce8ab463362e16a83b1d91ddfdc30e6ecb08adf1a1b1e679ee363d2fcebf443b1a27b58fc260aa09d02b
6
+ metadata.gz: ef346ac934aeeb5ad7960b343ab7fe07206ba97da57dd16c02f686028c3ff4678b21be3852dcde619188bbaf5881ef63d89540f7686e924c3bc9267e5b9ca96b
7
+ data.tar.gz: 99b81500ee6ba0aac5a43cf54a3991f07e5856232a6cac49567c242b8195eaae72c49d9d6a4e8f390ae3836f5667bb977adfa944b11d01f1af6be88f57ad936b
@@ -0,0 +1 @@
1
+ *.DS_STORE
@@ -1,45 +1,42 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- artext (0.0.1)
5
- addressable (~> 2.3)
6
- fastimage (~> 1.6)
7
- httparty (~> 0.13)
8
- mini_magick (~> 3.7)
9
- nokogiri (~> 1.6)
4
+ artext (0.0.4)
5
+ addressable (~> 2.5)
6
+ fastimage (~> 2.1)
7
+ httparty (~> 0.14)
8
+ mini_magick (~> 4.8)
9
+ nokogiri (~> 1.8)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- addressable (2.3.8)
15
- diff-lcs (1.2.5)
16
- fastimage (1.7.0)
17
- addressable (~> 2.3, >= 2.3.5)
18
- httparty (0.13.5)
19
- json (~> 1.8)
14
+ addressable (2.5.2)
15
+ public_suffix (>= 2.0.2, < 4.0)
16
+ diff-lcs (1.3)
17
+ fastimage (2.1.4)
18
+ httparty (0.16.2)
20
19
  multi_xml (>= 0.5.2)
21
- json (1.8.3)
22
- mini_magick (3.8.1)
23
- subexec (~> 0.2.1)
24
- mini_portile (0.6.2)
25
- multi_xml (0.5.5)
26
- nokogiri (1.6.6.2)
27
- mini_portile (~> 0.6.0)
28
- rake (10.4.2)
29
- rspec (3.3.0)
30
- rspec-core (~> 3.3.0)
31
- rspec-expectations (~> 3.3.0)
32
- rspec-mocks (~> 3.3.0)
33
- rspec-core (3.3.2)
34
- rspec-support (~> 3.3.0)
35
- rspec-expectations (3.3.1)
20
+ mini_magick (4.9.2)
21
+ mini_portile2 (2.3.0)
22
+ multi_xml (0.6.0)
23
+ nokogiri (1.8.5)
24
+ mini_portile2 (~> 2.3.0)
25
+ public_suffix (3.0.3)
26
+ rake (10.5.0)
27
+ rspec (3.8.0)
28
+ rspec-core (~> 3.8.0)
29
+ rspec-expectations (~> 3.8.0)
30
+ rspec-mocks (~> 3.8.0)
31
+ rspec-core (3.8.0)
32
+ rspec-support (~> 3.8.0)
33
+ rspec-expectations (3.8.2)
36
34
  diff-lcs (>= 1.2.0, < 2.0)
37
- rspec-support (~> 3.3.0)
38
- rspec-mocks (3.3.2)
35
+ rspec-support (~> 3.8.0)
36
+ rspec-mocks (3.8.0)
39
37
  diff-lcs (>= 1.2.0, < 2.0)
40
- rspec-support (~> 3.3.0)
41
- rspec-support (3.3.0)
42
- subexec (0.2.3)
38
+ rspec-support (~> 3.8.0)
39
+ rspec-support (3.8.0)
43
40
 
44
41
  PLATFORMS
45
42
  ruby
@@ -51,4 +48,4 @@ DEPENDENCIES
51
48
  rspec (~> 3.3)
52
49
 
53
50
  BUNDLED WITH
54
- 1.10.6
51
+ 1.16.3
data/Rakefile CHANGED
@@ -6,4 +6,6 @@ RSpec::Core::RakeTask.new
6
6
  task :default => :spec
7
7
  task :test => :spec
8
8
 
9
-
9
+ task :console do
10
+ exec "irb -r mygem -I ./lib"
11
+ end
@@ -23,9 +23,9 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec", "~> 3.3"
24
24
 
25
25
 
26
- spec.add_dependency "addressable", "~> 2.3"
27
- spec.add_dependency "httparty", "~> 0.13"
28
- spec.add_dependency "fastimage", "~> 1.6"
29
- spec.add_dependency "mini_magick", "~> 3.7"
30
- spec.add_dependency "nokogiri", "~> 1.6"
26
+ spec.add_dependency "addressable", "~> 2.5"
27
+ spec.add_dependency "httparty", "~> 0.14"
28
+ spec.add_dependency "fastimage", "~> 2.1"
29
+ spec.add_dependency "mini_magick", "~> 4.8"
30
+ spec.add_dependency "nokogiri", "~> 1.8"
31
31
  end
@@ -4,6 +4,7 @@ require "httparty"
4
4
  require "nokogiri"
5
5
  require "mini_magick"
6
6
  require "fastimage"
7
+ require "addressable"
7
8
 
8
9
  module Artext
9
10
 
@@ -121,7 +122,7 @@ module Artext
121
122
  if (article.count > 1)
122
123
  article = get_correct_article(article)
123
124
  score = 0.8
124
- end
125
+ end
125
126
  if (is_blank?(article))
126
127
  article = find_article(doc)
127
128
  score = 0.6
@@ -173,7 +174,7 @@ module Artext
173
174
  end
174
175
  return rel, score
175
176
  end
176
-
177
+
177
178
  def self.iteratively_clean(element, html, images, score)
178
179
  html = ""
179
180
  imgs = []
@@ -227,15 +228,17 @@ module Artext
227
228
  elsif (element.name == "p")
228
229
  p_elem, ti = extractp(element, score)
229
230
  tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
231
+ tv = tv.gsub("<p></p>", "")
230
232
  images = images + ti if (!is_blank?(ti))
231
233
  elsif (element.name == "figure")
232
- cap = element.search("figcaption").text.split.join(" ")
234
+ cap = element.search("figcaption").inner_html
233
235
  cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
234
236
  tv, ti = figurehandle(element, "", [])
235
237
  tv = "<figure>#{tv}#{cap}</figure>" if (!is_blank?(tv))
236
238
  images = images + ti
237
239
  elsif (element.name == "text")
238
240
  tv = element.text.split.join(" ")
241
+ tv = nil if tv == "advertisement"
239
242
  tv = "<p class\"inline\">#{tv}</p>" if (!is_blank?(tv))
240
243
  elsif (element.name == "i")
241
244
  tv = element.text.split.join(" ")
@@ -292,7 +295,7 @@ module Artext
292
295
  end
293
296
 
294
297
  def self.phandle(element, html, images)
295
- if (!is_blank?(element.children) && !(element.name == "a" && is_blank?(element.search("img"))))
298
+ if (!is_blank?(element.children) && !((element.name == "a" || element.name == "figure") && is_blank?(element.search("img"))))
296
299
  element.children.each do |elem|
297
300
  html, images = phandle(elem, html, images)
298
301
  end
@@ -303,6 +306,12 @@ module Artext
303
306
  html = "</p><figure><img src=\"#{img}\"></figure><p>"
304
307
  images << img
305
308
  end
309
+ elsif (element.name == "figure")
310
+ cap = element.search("figcaption").inner_html
311
+ cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
312
+ tv, ti = figurehandle(element, "", [])
313
+ html = "</p><figure>#{tv}#{cap}</figure><p>" if (!is_blank?(tv))
314
+ images << ti if (!is_blank?(ti))
306
315
  elsif (element.name == "a")
307
316
  html = html + " <a href=\"#{element.attribute("href").value if (!is_blank?(element.attribute("href")))}\">#{element.text.split.join(" ")}</a> "
308
317
  elsif (element.name == "text")
@@ -1,3 +1,3 @@
1
1
  module Artext
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,127 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: artext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Anindya Mondal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-14 00:00:00.000000000 Z
11
+ date: 2018-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '10.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '3.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: addressable
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.3'
61
+ version: '2.5'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.3'
68
+ version: '2.5'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: httparty
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ~>
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0.13'
75
+ version: '0.14'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ~>
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0.13'
82
+ version: '0.14'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: fastimage
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '1.6'
89
+ version: '2.1'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ~>
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '1.6'
96
+ version: '2.1'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: mini_magick
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - ~>
101
+ - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '3.7'
103
+ version: '4.8'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - ~>
108
+ - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '3.7'
110
+ version: '4.8'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: nokogiri
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
- - - ~>
115
+ - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: '1.6'
117
+ version: '1.8'
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - ~>
122
+ - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: '1.6'
124
+ version: '1.8'
125
125
  description: Extract article and other metadata from websites.
126
126
  email:
127
127
  - anindyamondal@mazdigital.com
@@ -129,6 +129,7 @@ executables: []
129
129
  extensions: []
130
130
  extra_rdoc_files: []
131
131
  files:
132
+ - ".gitignore"
132
133
  - Gemfile
133
134
  - Gemfile.lock
134
135
  - LICENSE.txt
@@ -147,17 +148,17 @@ require_paths:
147
148
  - lib
148
149
  required_ruby_version: !ruby/object:Gem::Requirement
149
150
  requirements:
150
- - - '>='
151
+ - - ">="
151
152
  - !ruby/object:Gem::Version
152
153
  version: '0'
153
154
  required_rubygems_version: !ruby/object:Gem::Requirement
154
155
  requirements:
155
- - - '>='
156
+ - - ">="
156
157
  - !ruby/object:Gem::Version
157
158
  version: '0'
158
159
  requirements: []
159
160
  rubyforge_project:
160
- rubygems_version: 2.4.8
161
+ rubygems_version: 2.7.7
161
162
  signing_key:
162
163
  specification_version: 4
163
164
  summary: Extract article from websites.