embulk-filter-strip_html_tags 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a3f9c5cf676a67b3f43b8ff43175e6508f9f5a4b
4
+ data.tar.gz: 8e8461e5a1a699b698ddcf97835603f407080afe
5
+ SHA512:
6
+ metadata.gz: 106d1c7968058015970135185812d2d8fd744cd281fbc4b1752ba2803f75810cc607d6d6cf4faa0b9a112bee04c642fce78899912d9a89dd03d9a6a49c630e9c
7
+ data.tar.gz: 01e04b789d4de23c575b4d4b67c364c731a96ea3ab38a31665bb4e36e20f8dca49993f251675085a217a733643b47f690e21e5b420b272dd02dfd16c6d672252
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.1.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Gems are fetched from rubygems.org; the dependency list itself is taken
# from the gemspec in this directory.
source "https://rubygems.org/"
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # Strip Html Tags filter plugin for Embulk
2
+
3
+ This plugin strips HTML tags from values of specified columns.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: filter
8
+
9
+ ## Configuration
10
+
11
+ - **columns**: names of the columns to strip HTML tags from (array<string>, default: `[]`)
12
+
13
+ ## Example
14
+
15
+ These settings strip tags from columns foo and bar and leave other columns untouched.
16
+
17
+ ```yaml
18
+ in:
19
+ type: file
20
+ path_prefix: ./test.csv
21
+ parser:
22
+ type: csv
23
+ charset: UTF-8
24
+ delimiter: ","
25
+ columns:
26
+ - {name: foo, type: string}
27
+ - {name: bar, type: string}
28
+ - {name: baz, type: string}
29
+
30
+ filters:
31
+ - type: strip_html_tags
32
+ columns:
33
+ - foo
34
+ - bar
35
+
36
+ out:
37
+ type: stdout
38
+ ```
39
+
40
+ It converts a CSV record like this:
41
+
42
+ ```csv
43
+ <a>foo</a>,<div>bar</div>,<p>baz</p>
44
+ ```
45
+
46
+ into:
47
+
48
+ ```
49
+ foo,bar,<p>baz</p>
50
+ ```
51
+
52
+ ## Build
53
+
54
+ ```
55
+ $ rake
56
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
# Running `rake` without arguments invokes the `build` task.
task default: [:build]
@@ -0,0 +1,21 @@
1
+
2
# Gem specification for the Embulk filter plugin that strips HTML tags.
Gem::Specification.new do |gem|
  # Identity
  gem.name        = "embulk-filter-strip_html_tags"
  gem.version     = "0.1.0"
  gem.summary     = "Strip Html Tags filter plugin for Embulk"
  gem.description = "Strip Html Tags"

  # Ownership and licensing
  gem.authors  = ["SAWADA Tadashi"]
  gem.email    = ["cesare@mayverse.jp"]
  gem.homepage = "https://github.com/cesare/embulk-filter-strip_html_tags"
  gem.licenses = ["MIT"]

  # Packaged files: everything tracked by git plus any jars under classpath/.
  tracked_files  = `git ls-files`.split("\n")
  classpath_jars = Dir["classpath/*.jar"]
  gem.files      = tracked_files + classpath_jars

  # Must come after `files` is assigned, because it filters that list.
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Runtime dependency
  gem.add_dependency("nokogiri", "~> 1.8.0")

  # Development-only dependencies
  gem.add_development_dependency("embulk", ">= 0.8.30")
  gem.add_development_dependency("bundler", ">= 1.10.6")
  gem.add_development_dependency("rake", ">= 10.0")
end
@@ -0,0 +1,56 @@
1
+ require "nokogiri"
2
+
3
module Embulk
  module Filter
    # Embulk filter plugin that strips HTML tags from the values of the
    # configured columns and passes every other column through untouched.
    #
    # Registered under the plugin type "strip_html_tags".
    class StripHtmlTags < FilterPlugin
      Plugin.register_filter("strip_html_tags", self)

      # Builds the task from the plugin configuration and yields it together
      # with the output schema.
      #
      # @param config plugin configuration; the optional "columns" array
      #   lists the names of the columns to process (default: [])
      # @param in_schema input schema, reused unchanged as the output schema
      # @return the value returned by the yielded block
      def self.transaction(config, in_schema, &control)
        task = {
          "columns" => config.param("columns", :array, default: []),
        }

        # Only values are rewritten, so the schema is forwarded as-is.
        out_columns = in_schema

        yield(task, out_columns)
      end

      attr_reader :target_columns, :target_fields

      # Precomputes, once per plugin instance, which columns must be stripped.
      # `task` and `out_schema` are presumably supplied by FilterPlugin.
      def init
        @target_columns = task["columns"]
        # One flag per schema column, in schema order, so that it can be
        # zipped with the values of each record in #fix_record.
        @target_fields = out_schema.map { |c| @target_columns.include?(c.name) }
      end

      # Nothing to release.
      def close
      end

      # Rewrites every record of the page and appends it to the page builder.
      #
      # @param page enumerable of records (each record an array of values)
      def add(page)
        page.each do |record|
          page_builder.add(fix_record(record))
        end
      end

      # Signals the page builder that no more records will be added.
      def finish
        page_builder.finish
      end

      private

      # Returns a copy of the record in which the values of targeted columns
      # have had their HTML tags removed.
      #
      # @param record [Array] values of one record, in schema order
      # @return [Array] the rewritten values, same length and order
      def fix_record(record)
        record.zip(target_fields).map do |value, target|
          target ? strip_tags(value) : value
        end
      end

      # Removes HTML markup from a string by parsing it as HTML and taking
      # the document's text.
      #
      # Anything that is not a String is returned unchanged: a NULL cell
      # (nil) must stay NULL instead of being replaced by the parser's
      # result for empty input, and a non-string column listed by mistake
      # must not be handed to the HTML parser.
      #
      # @param str [String, nil, Object] the column value
      # @return the text content for strings, the original value otherwise
      def strip_tags(str)
        return str unless str.is_a?(String)

        Nokogiri::HTML.parse(str).text
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-strip_html_tags
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - SAWADA Tadashi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-11-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: 1.8.0
19
+ name: nokogiri
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.8.0
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 0.8.30
33
+ name: embulk
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 0.8.30
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.10.6
47
+ name: bundler
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 1.10.6
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ name: rake
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ description: Strip Html Tags
70
+ email:
71
+ - cesare@mayverse.jp
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".ruby-version"
78
+ - Gemfile
79
+ - LICENSE.txt
80
+ - README.md
81
+ - Rakefile
82
+ - embulk-filter-strip_html_tags.gemspec
83
+ - lib/embulk/filter/strip_html_tags.rb
84
+ homepage: https://github.com/cesare/embulk-filter-strip_html_tags
85
+ licenses:
86
+ - MIT
87
+ metadata: {}
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubyforge_project:
104
+ rubygems_version: 2.6.6
105
+ signing_key:
106
+ specification_version: 4
107
+ summary: Strip Html Tags filter plugin for Embulk
108
+ test_files: []