wp2txt 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c05af7e5c72b073f18b53eca8619212f0928aaa1
4
- data.tar.gz: 02dc116458041b096fd811b3fec6ffb6e6ff3ee7
3
+ metadata.gz: bcfd6986e262e455c100d664583b099d57ff4428
4
+ data.tar.gz: 68c922b43951b7f326b0136681981a166208d0a9
5
5
  SHA512:
6
- metadata.gz: 26479a43e8e3ccebfb6578562d62b7b6d2234e924838774a11d143c8fad7e1952db0278526d202a77aad680926b96baa25c8da9e0c23f916d95ccf614d5e82a3
7
- data.tar.gz: 8959c9b51efb18386cf556c67439332fc43723d8de8abfc014745d0d0ad1f89ba56b6222b6f8d2a33748343e2dbef93752ee104c8a54c57e01fe21bee7e2f892
6
+ metadata.gz: 294e0f8e1d2b37534ad885c617cfbd72ad72144dca6fb01231f6e2cf691a86bf58690f5dd1b2b410f8ee23eb3c74fa3f40e4ca8bbf3f3921ea78295783da5f2e
7
+ data.tar.gz: 71a1b8feca5c3067ff534f0239c4c937485ff3ed8c0a6de793be0b71befd440cb5b1c2c465dc6684c558a53209ab6481dd2e60edfe7fb7cbdd9c6f07416efd24
data/bin/benchmark.rb ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ $: << File.join(File.dirname(__FILE__))
5
+ $: << File.join(File.dirname(__FILE__), '..', 'lib')
6
+
7
+ require 'wp2txt'
8
+ require 'wp2txt/utils'
9
+ include Wp2txt
10
+ require 'benchmark'
11
+
12
+ data_dir = File.join(File.dirname(__FILE__), '..', "data")
13
+
14
+ parent = Wp2txt::CmdProgbar.new
15
+ input_file = File.join(data_dir, "testdata.bz2")
16
+ output_dir = data_dir
17
+ tfile_size = 10
18
+ convert = true
19
+ strip_tmarker = true
20
+
21
+
22
+
23
+ Benchmark.bm do |x|
24
+ x.report do
25
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
26
+ wpconv.extract_text do |article|
27
+ title = format_wiki article.title
28
+ title = "[[#{title}]]\n"
29
+
30
+ contents = "\nCATEGORIES: "
31
+ contents += article.categories.join(", ")
32
+ contents += "\n\n"
33
+
34
+ article.elements.each do |e|
35
+ case e.first
36
+ when :mw_heading
37
+ line = format_wiki(e.last)
38
+ when :mw_paragraph
39
+ line = format_wiki(e.last)
40
+ when :mw_table, :mw_htable
41
+ line = format_wiki(e.last)
42
+ when :mw_pre
43
+ line = e.last
44
+ when :mw_quote
45
+ line = format_wiki(e.last)
46
+ when :mw_unordered, :mw_ordered, :mw_definition
47
+ line = format_wiki(e.last)
48
+ when :mw_redirect
49
+ line = format_wiki(e.last)
50
+ line += "\n\n"
51
+ else
52
+ next
53
+ end
54
+ contents += line
55
+ contents = remove_templates(contents)
56
+ end
57
+
58
+ ##### cleanup #####
59
+ if /\A\s*\z/m =~ contents
60
+ result = ""
61
+ else
62
+ result = title + "\n" + contents
63
+ end
64
+ result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
65
+ result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
66
+ end
67
+ end
68
+ end
69
+
data/bin/wp2txt CHANGED
@@ -45,7 +45,7 @@ output_dir = opts[:output_dir]
45
45
  tfile_size = opts[:file_size]
46
46
  convert = opts[:convert]
47
47
  strip_tmarker = opts[:marker] ? false : true
48
- opt_array = [:title_off, :list, :heading, :table, :template, :redirect]
48
+ opt_array = [:title, :list, :heading, :table, :template, :redirect]
49
49
  config = {}
50
50
  opt_array.each do |opt|
51
51
  config[opt] = opts[opt]
data/lib/wp2txt/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.6.0"
2
+ VERSION = "0.6.1"
3
3
  end
data/wp2txt.gemspec CHANGED
@@ -1,4 +1,4 @@
1
- # -*- coding: utf-8 -*-
1
+ # -*- coding: utf-8 -*-
2
2
  $:.push File.expand_path("../lib", __FILE__)
3
3
  require "wp2txt/version"
4
4
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-04 00:00:00.000000000 Z
11
+ date: 2014-10-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -99,6 +99,7 @@ description: WP2TXT extracts plain text data from Wikipedia dump file (encoded i
99
99
  email:
100
100
  - yohasebe@gmail.com
101
101
  executables:
102
+ - benchmark.rb
102
103
  - wp2txt
103
104
  extensions: []
104
105
  extra_rdoc_files: []
@@ -108,6 +109,7 @@ files:
108
109
  - LICENSE
109
110
  - README.md
110
111
  - Rakefile
112
+ - bin/benchmark.rb
111
113
  - bin/wp2txt
112
114
  - data/testdata.bz2
113
115
  - lib/wp2txt.rb
@@ -138,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
138
140
  version: '0'
139
141
  requirements: []
140
142
  rubyforge_project: wp2txt
141
- rubygems_version: 2.4.1
143
+ rubygems_version: 2.4.2
142
144
  signing_key:
143
145
  specification_version: 4
144
146
  summary: Wikipedia dump to text converter