artext 0.0.2 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 379fd2eab84219f4a82393817354930f7e6cbacd
4
- data.tar.gz: cee44b5160c701f51c1836d6cd0a27d655ecf4bb
2
+ SHA256:
3
+ metadata.gz: 2b5fad951364d5b4d72143b5306bc59138b8ecbc120f34acd826aa7a64694e7c
4
+ data.tar.gz: 71c65f3cb3d0a7451dab3703050cd788454bdfba65981a8e633dc5f62808b059
5
5
  SHA512:
6
- metadata.gz: e505ce5c26c90ecd95ffee18656626954276fc5333e607c158b650f2ded704a7b69a2181904555ae106f25e252488ca878b3aa58bccbd7fda21145c013290ac2
7
- data.tar.gz: a6ffbd0ebe58681e9ecb9e0554e3aa0fc9ff446fe082ce8ab463362e16a83b1d91ddfdc30e6ecb08adf1a1b1e679ee363d2fcebf443b1a27b58fc260aa09d02b
6
+ metadata.gz: ef346ac934aeeb5ad7960b343ab7fe07206ba97da57dd16c02f686028c3ff4678b21be3852dcde619188bbaf5881ef63d89540f7686e924c3bc9267e5b9ca96b
7
+ data.tar.gz: 99b81500ee6ba0aac5a43cf54a3991f07e5856232a6cac49567c242b8195eaae72c49d9d6a4e8f390ae3836f5667bb977adfa944b11d01f1af6be88f57ad936b
@@ -0,0 +1 @@
1
+ *.DS_STORE
@@ -1,45 +1,42 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- artext (0.0.1)
5
- addressable (~> 2.3)
6
- fastimage (~> 1.6)
7
- httparty (~> 0.13)
8
- mini_magick (~> 3.7)
9
- nokogiri (~> 1.6)
4
+ artext (0.0.4)
5
+ addressable (~> 2.5)
6
+ fastimage (~> 2.1)
7
+ httparty (~> 0.14)
8
+ mini_magick (~> 4.8)
9
+ nokogiri (~> 1.8)
10
10
 
11
11
  GEM
12
12
  remote: https://rubygems.org/
13
13
  specs:
14
- addressable (2.3.8)
15
- diff-lcs (1.2.5)
16
- fastimage (1.7.0)
17
- addressable (~> 2.3, >= 2.3.5)
18
- httparty (0.13.5)
19
- json (~> 1.8)
14
+ addressable (2.5.2)
15
+ public_suffix (>= 2.0.2, < 4.0)
16
+ diff-lcs (1.3)
17
+ fastimage (2.1.4)
18
+ httparty (0.16.2)
20
19
  multi_xml (>= 0.5.2)
21
- json (1.8.3)
22
- mini_magick (3.8.1)
23
- subexec (~> 0.2.1)
24
- mini_portile (0.6.2)
25
- multi_xml (0.5.5)
26
- nokogiri (1.6.6.2)
27
- mini_portile (~> 0.6.0)
28
- rake (10.4.2)
29
- rspec (3.3.0)
30
- rspec-core (~> 3.3.0)
31
- rspec-expectations (~> 3.3.0)
32
- rspec-mocks (~> 3.3.0)
33
- rspec-core (3.3.2)
34
- rspec-support (~> 3.3.0)
35
- rspec-expectations (3.3.1)
20
+ mini_magick (4.9.2)
21
+ mini_portile2 (2.3.0)
22
+ multi_xml (0.6.0)
23
+ nokogiri (1.8.5)
24
+ mini_portile2 (~> 2.3.0)
25
+ public_suffix (3.0.3)
26
+ rake (10.5.0)
27
+ rspec (3.8.0)
28
+ rspec-core (~> 3.8.0)
29
+ rspec-expectations (~> 3.8.0)
30
+ rspec-mocks (~> 3.8.0)
31
+ rspec-core (3.8.0)
32
+ rspec-support (~> 3.8.0)
33
+ rspec-expectations (3.8.2)
36
34
  diff-lcs (>= 1.2.0, < 2.0)
37
- rspec-support (~> 3.3.0)
38
- rspec-mocks (3.3.2)
35
+ rspec-support (~> 3.8.0)
36
+ rspec-mocks (3.8.0)
39
37
  diff-lcs (>= 1.2.0, < 2.0)
40
- rspec-support (~> 3.3.0)
41
- rspec-support (3.3.0)
42
- subexec (0.2.3)
38
+ rspec-support (~> 3.8.0)
39
+ rspec-support (3.8.0)
43
40
 
44
41
  PLATFORMS
45
42
  ruby
@@ -51,4 +48,4 @@ DEPENDENCIES
51
48
  rspec (~> 3.3)
52
49
 
53
50
  BUNDLED WITH
54
- 1.10.6
51
+ 1.16.3
data/Rakefile CHANGED
@@ -6,4 +6,6 @@ RSpec::Core::RakeTask.new
6
6
  task :default => :spec
7
7
  task :test => :spec
8
8
 
9
-
9
+ task :console do
10
+ exec "irb -r mygem -I ./lib"
11
+ end
@@ -23,9 +23,9 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "rspec", "~> 3.3"
24
24
 
25
25
 
26
- spec.add_dependency "addressable", "~> 2.3"
27
- spec.add_dependency "httparty", "~> 0.13"
28
- spec.add_dependency "fastimage", "~> 1.6"
29
- spec.add_dependency "mini_magick", "~> 3.7"
30
- spec.add_dependency "nokogiri", "~> 1.6"
26
+ spec.add_dependency "addressable", "~> 2.5"
27
+ spec.add_dependency "httparty", "~> 0.14"
28
+ spec.add_dependency "fastimage", "~> 2.1"
29
+ spec.add_dependency "mini_magick", "~> 4.8"
30
+ spec.add_dependency "nokogiri", "~> 1.8"
31
31
  end
@@ -4,6 +4,7 @@ require "httparty"
4
4
  require "nokogiri"
5
5
  require "mini_magick"
6
6
  require "fastimage"
7
+ require "addressable"
7
8
 
8
9
  module Artext
9
10
 
@@ -121,7 +122,7 @@ module Artext
121
122
  if (article.count > 1)
122
123
  article = get_correct_article(article)
123
124
  score = 0.8
124
- end
125
+ end
125
126
  if (is_blank?(article))
126
127
  article = find_article(doc)
127
128
  score = 0.6
@@ -173,7 +174,7 @@ module Artext
173
174
  end
174
175
  return rel, score
175
176
  end
176
-
177
+
177
178
  def self.iteratively_clean(element, html, images, score)
178
179
  html = ""
179
180
  imgs = []
@@ -227,15 +228,17 @@ module Artext
227
228
  elsif (element.name == "p")
228
229
  p_elem, ti = extractp(element, score)
229
230
  tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
231
+ tv = tv.gsub("<p></p>", "")
230
232
  images = images + ti if (!is_blank?(ti))
231
233
  elsif (element.name == "figure")
232
- cap = element.search("figcaption").text.split.join(" ")
234
+ cap = element.search("figcaption").inner_html
233
235
  cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
234
236
  tv, ti = figurehandle(element, "", [])
235
237
  tv = "<figure>#{tv}#{cap}</figure>" if (!is_blank?(tv))
236
238
  images = images + ti
237
239
  elsif (element.name == "text")
238
240
  tv = element.text.split.join(" ")
241
+ tv = nil if tv == "advertisement"
239
242
  tv = "<p class\"inline\">#{tv}</p>" if (!is_blank?(tv))
240
243
  elsif (element.name == "i")
241
244
  tv = element.text.split.join(" ")
@@ -292,7 +295,7 @@ module Artext
292
295
  end
293
296
 
294
297
  def self.phandle(element, html, images)
295
- if (!is_blank?(element.children) && !(element.name == "a" && is_blank?(element.search("img"))))
298
+ if (!is_blank?(element.children) && !((element.name == "a" || element.name == "figure") && is_blank?(element.search("img"))))
296
299
  element.children.each do |elem|
297
300
  html, images = phandle(elem, html, images)
298
301
  end
@@ -303,6 +306,12 @@ module Artext
303
306
  html = "</p><figure><img src=\"#{img}\"></figure><p>"
304
307
  images << img
305
308
  end
309
+ elsif (element.name == "figure")
310
+ cap = element.search("figcaption").inner_html
311
+ cap = is_blank?(cap) ? "" : "<figcaption>#{cap}</figcaption>"
312
+ tv, ti = figurehandle(element, "", [])
313
+ html = "</p><figure>#{tv}#{cap}</figure><p>" if (!is_blank?(tv))
314
+ images << ti if (!is_blank?(ti))
306
315
  elsif (element.name == "a")
307
316
  html = html + " <a href=\"#{element.attribute("href").value if (!is_blank?(element.attribute("href")))}\">#{element.text.split.join(" ")}</a> "
308
317
  elsif (element.name == "text")
@@ -1,3 +1,3 @@
1
1
  module Artext
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.4"
3
3
  end
metadata CHANGED
@@ -1,127 +1,127 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: artext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Anindya Mondal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-14 00:00:00.000000000 Z
11
+ date: 2018-10-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.6'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.6'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '10.0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rspec
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '3.3'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.3'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: addressable
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - ~>
59
+ - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.3'
61
+ version: '2.5'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - ~>
66
+ - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.3'
68
+ version: '2.5'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: httparty
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ~>
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: '0.13'
75
+ version: '0.14'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ~>
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: '0.13'
82
+ version: '0.14'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: fastimage
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: '1.6'
89
+ version: '2.1'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - ~>
94
+ - - "~>"
95
95
  - !ruby/object:Gem::Version
96
- version: '1.6'
96
+ version: '2.1'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: mini_magick
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - ~>
101
+ - - "~>"
102
102
  - !ruby/object:Gem::Version
103
- version: '3.7'
103
+ version: '4.8'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - ~>
108
+ - - "~>"
109
109
  - !ruby/object:Gem::Version
110
- version: '3.7'
110
+ version: '4.8'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: nokogiri
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
- - - ~>
115
+ - - "~>"
116
116
  - !ruby/object:Gem::Version
117
- version: '1.6'
117
+ version: '1.8'
118
118
  type: :runtime
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - ~>
122
+ - - "~>"
123
123
  - !ruby/object:Gem::Version
124
- version: '1.6'
124
+ version: '1.8'
125
125
  description: Extract article and other metadata from websites.
126
126
  email:
127
127
  - anindyamondal@mazdigital.com
@@ -129,6 +129,7 @@ executables: []
129
129
  extensions: []
130
130
  extra_rdoc_files: []
131
131
  files:
132
+ - ".gitignore"
132
133
  - Gemfile
133
134
  - Gemfile.lock
134
135
  - LICENSE.txt
@@ -147,17 +148,17 @@ require_paths:
147
148
  - lib
148
149
  required_ruby_version: !ruby/object:Gem::Requirement
149
150
  requirements:
150
- - - '>='
151
+ - - ">="
151
152
  - !ruby/object:Gem::Version
152
153
  version: '0'
153
154
  required_rubygems_version: !ruby/object:Gem::Requirement
154
155
  requirements:
155
- - - '>='
156
+ - - ">="
156
157
  - !ruby/object:Gem::Version
157
158
  version: '0'
158
159
  requirements: []
159
160
  rubyforge_project:
160
- rubygems_version: 2.4.8
161
+ rubygems_version: 2.7.7
161
162
  signing_key:
162
163
  specification_version: 4
163
164
  summary: Extract article from websites.